From 8735da795a14c5dbad9470b0cef06c2d152391fa Mon Sep 17 00:00:00 2001 From: Bradley Shellnut Date: Sun, 24 Aug 2025 23:02:31 -0700 Subject: [PATCH] Adding retry to bandcamp scraping. --- src/routes/api/articles/+server.ts | 86 +++++++++++------------ src/routes/api/bandcamp/albums/+server.ts | 71 ++++++++++--------- 2 files changed, 81 insertions(+), 76 deletions(-) diff --git a/src/routes/api/articles/+server.ts b/src/routes/api/articles/+server.ts index 2740044..a647498 100644 --- a/src/routes/api/articles/+server.ts +++ b/src/routes/api/articles/+server.ts @@ -1,51 +1,51 @@ import { json } from '@sveltejs/kit'; +import type { ArticlePageLoad } from '@/lib/types/article.js'; import { PAGE_SIZE } from '$env/static/private'; import { fetchArticlesApi } from '$lib/services/articlesApi'; -import type { ArticlePageLoad } from '@/lib/types/article.js'; export async function GET({ setHeaders, url }) { - const page = url?.searchParams?.get('page') || '1'; - let limit = url?.searchParams?.get('limit') ?? PAGE_SIZE; - if (Number(limit) > 30) { - limit = PAGE_SIZE; - } + const page = url?.searchParams?.get('page') || '1'; + let limit = url?.searchParams?.get('limit') ?? PAGE_SIZE; + if (Number(limit) > 30) { + limit = PAGE_SIZE; + } - try { - const response: ArticlePageLoad = await fetchArticlesApi('get', 'fetchArticles', { - page, - limit - }); + try { + const response: ArticlePageLoad = await fetchArticlesApi('get', 'fetchArticles', { + page, + limit, + }); - if (response?.articles) { - if (response?.cacheControl) { - if (!response.cacheControl.includes('no-cache')) { - setHeaders({ - 'cache-control': response?.cacheControl - }); - } else { - setHeaders({ - 'cache-control': 'max-age=43200' - }); - } - } + if (response?.articles) { + if (response?.cacheControl) { + if (!response.cacheControl.includes('no-cache')) { + setHeaders({ + 'cache-control': response?.cacheControl, + }); + } else { + setHeaders({ + 'cache-control': 'max-age=43200', + }); + } + } - return json(response); - } - } catch (e) { - console.error(e); - // Fall back to an empty, cacheable payload so pages can still render in E2E - const fallback: ArticlePageLoad = { - articles: [], - currentPage: Number(page) || 1, - totalArticles: 0, - totalPages: 1, - limit: Number(limit) || 10, - cacheControl: 'no-cache' - } as unknown as ArticlePageLoad; - return json(fallback, { - headers: { - 'cache-control': 'no-cache' - } - }); - } -}; + return json(response); + } + } catch (e) { + console.error(e); + // Fall back to an empty, cacheable payload so pages can still render in E2E + const fallback: ArticlePageLoad = { + articles: [], + currentPage: Number(page) || 1, + totalArticles: 0, + totalPages: 1, + limit: Number(limit) || 10, + cacheControl: 'no-cache', + } as unknown as ArticlePageLoad; + return json(fallback, { + headers: { + 'cache-control': 'no-cache', + }, + }); + } +} diff --git a/src/routes/api/bandcamp/albums/+server.ts b/src/routes/api/bandcamp/albums/+server.ts index cf41196..8579079 100644 --- a/src/routes/api/bandcamp/albums/+server.ts +++ b/src/routes/api/bandcamp/albums/+server.ts @@ -1,64 +1,69 @@ -import { json, error } from '@sveltejs/kit'; +import { json } from '@sveltejs/kit'; +import scrapeIt, { type ScrapeResult } from 'scrape-it'; import { BANDCAMP_USERNAME, USE_REDIS_CACHE } from '$env/static/private'; import { redis } from '$lib/server/redis'; import type { Album, BandCampResults } from '$lib/types/album'; -import scrapeIt, { type ScrapeResult } from 'scrape-it'; -export async function GET({ setHeaders, url }) { +async function retryWithBackoff(fn: () => Promise, maxRetries = 3, baseDelay = 500): Promise { + let lastError: Error | undefined; + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + return await fn(); + } catch (err) { + lastError = err as Error; + if (attempt === maxRetries) break; + const delay = baseDelay * 2 ** attempt; // 500ms, 1s, 2s + await new Promise((r) => setTimeout(r, delay)); + } + } + throw lastError; +} + +export async function GET({ setHeaders }) { try { if (USE_REDIS_CACHE === 'true') { const cached: string | null = await redis.get('bandcampAlbums'); if (cached) { - const response: Album[] = JSON.parse(cached); - const ttl = await redis.ttl("bandcampAlbums"); + const response: Album[] = JSON.parse(cached); + const ttl = await redis.ttl('bandcampAlbums'); if (ttl) { setHeaders({ - "cache-control": `max-age=${ttl}`, + 'cache-control': `max-age=${ttl}`, }); } else { setHeaders({ - "cache-control": "max-age=43200", + 'cache-control': 'max-age=43200', }); } return json(response); - } + } } - const { data }: ScrapeResult = await scrapeIt(`https://bandcamp.com/${BANDCAMP_USERNAME}`, { - collectionItems: { - listItem: '.collection-item-container', - data: { - url: { - selector: '.collection-title-details > a.item-link', - attr: 'href', - }, - artwork: { - selector: 'div.collection-item-art-container a img', - attr: 'src', - }, - title: { - selector: 'span.item-link-alt > div.collection-item-title', - }, - artist: { - selector: 'span.item-link-alt > div.collection-item-artist', + // Scrape Bandcamp with realistic headers, plus retry/backoff + const { data }: ScrapeResult = await retryWithBackoff(async () => + await scrapeIt(`https://bandcamp.com/${BANDCAMP_USERNAME}`, { + collectionItems: { + listItem: '.collection-item-container', + data: { + url: { selector: '.collection-title-details > a.item-link', attr: 'href' }, + artwork: { selector: 'div.collection-item-art-container a img', attr: 'src' }, + title: { selector: 'span.item-link-alt > div.collection-item-title' }, + artist: { selector: 'span.item-link-alt > div.collection-item-artist' }, }, }, - }, - }); + }) + ); const albums: Album[] = data?.collectionItems || []; - - if (albums && albums?.length > 0) { + if (albums && albums.length > 0) { if (USE_REDIS_CACHE === 'true') { redis.set('bandcampAlbums', JSON.stringify(albums), 'EX', 43200); } - setHeaders({ - "cache-control": "max-age=43200", - }); + setHeaders({ 'cache-control': 'max-age=43200' }); return json(albums); } - return json([]); + return json([]); } catch (error) { console.error(error); return json([]);