Add retry with exponential backoff to Bandcamp scraping.

This commit is contained in:
Bradley Shellnut 2025-08-24 23:02:31 -07:00
parent 60f53e84ed
commit 8735da795a
2 changed files with 81 additions and 76 deletions

View file

@ -1,51 +1,51 @@
import { json } from '@sveltejs/kit'; import { json } from '@sveltejs/kit';
import type { ArticlePageLoad } from '@/lib/types/article.js';
import { PAGE_SIZE } from '$env/static/private'; import { PAGE_SIZE } from '$env/static/private';
import { fetchArticlesApi } from '$lib/services/articlesApi'; import { fetchArticlesApi } from '$lib/services/articlesApi';
import type { ArticlePageLoad } from '@/lib/types/article.js';
/**
 * GET handler that returns one page of articles as JSON.
 *
 * Query parameters:
 * - `page`  - page to fetch; defaults to `'1'` when missing or empty.
 * - `limit` - page size; anything that is not a whole number from 1 to 30
 *   falls back to `PAGE_SIZE`.
 *
 * @returns A JSON `Response`. On success it is the payload returned by
 *   `fetchArticlesApi`. If that call throws, or its payload has no `articles`,
 *   an empty page is returned with `cache-control: no-cache` so the handler
 *   always produces a `Response`.
 */
export async function GET({ setHeaders, url }) {
	const page = url?.searchParams?.get('page') || '1';
	let limit = url?.searchParams?.get('limit') ?? PAGE_SIZE;

	// Only forward a sane page size; garbage, zero, negative or oversized
	// values are replaced by the configured default.
	const requestedLimit = Number(limit);
	if (!Number.isInteger(requestedLimit) || requestedLimit < 1 || requestedLimit > 30) {
		limit = PAGE_SIZE;
	}

	// Empty, non-cacheable payload so pages can still render (e.g. in E2E)
	// when the articles API is unavailable or returns nothing usable.
	const emptyPage = () => {
		// NOTE(review): the double cast is kept from the original because the
		// ArticlePageLoad definition is not visible here - confirm the shape and
		// drop the cast if the literal already satisfies the type.
		const fallback: ArticlePageLoad = {
			articles: [],
			currentPage: Number(page) || 1,
			totalArticles: 0,
			totalPages: 1,
			limit: Number(limit) || 10,
			cacheControl: 'no-cache',
		} as unknown as ArticlePageLoad;
		return json(fallback, {
			headers: {
				'cache-control': 'no-cache',
			},
		});
	};

	try {
		const response: ArticlePageLoad = await fetchArticlesApi('get', 'fetchArticles', {
			page,
			limit,
		});

		if (response?.articles) {
			if (response.cacheControl) {
				// NOTE(review): an upstream 'no-cache' is replaced by a 12-hour
				// max-age here, as in the original - confirm this is intended.
				setHeaders({
					'cache-control': response.cacheControl.includes('no-cache')
						? 'max-age=43200'
						: response.cacheControl,
				});
			}
			return json(response);
		}

		// Previously this path fell through and returned undefined, which is
		// not a valid handler result.
		return emptyPage();
	} catch (e) {
		console.error(e);
		return emptyPage();
	}
}

View file

@ -1,64 +1,69 @@
import { json, error } from '@sveltejs/kit'; import { json } from '@sveltejs/kit';
import scrapeIt, { type ScrapeResult } from 'scrape-it';
import { BANDCAMP_USERNAME, USE_REDIS_CACHE } from '$env/static/private'; import { BANDCAMP_USERNAME, USE_REDIS_CACHE } from '$env/static/private';
import { redis } from '$lib/server/redis'; import { redis } from '$lib/server/redis';
import type { Album, BandCampResults } from '$lib/types/album'; import type { Album, BandCampResults } from '$lib/types/album';
import scrapeIt, { type ScrapeResult } from 'scrape-it';
/**
 * Runs `fn`, retrying with exponential backoff while it rejects.
 *
 * `fn` is always invoked at least once, even when `maxRetries` is negative or
 * `NaN`. After a failed attempt number `attempt` (0-based) the helper waits
 * `baseDelay * 2 ** attempt` milliseconds before trying again, i.e.
 * 500ms, 1s, 2s with the defaults.
 *
 * @param fn - Async operation to attempt.
 * @param maxRetries - Number of retries after the first attempt (default 3,
 *   so up to 4 calls in total).
 * @param baseDelay - Delay in milliseconds before the first retry (default 500).
 * @returns The value `fn` resolves with on its first successful attempt.
 * @throws The rejection value of the last attempt, unchanged, once all
 *   attempts have failed.
 */
async function retryWithBackoff<T>(fn: () => Promise<T>, maxRetries = 3, baseDelay = 500): Promise<T> {
	for (let attempt = 0; ; attempt++) {
		try {
			return await fn();
		} catch (err) {
			// Written as a negated "<=" so that NaN counts as "no retries left".
			const canRetry = attempt + 1 <= maxRetries;
			if (!canRetry) throw err;
			const delay = baseDelay * 2 ** attempt; // 500ms, 1s, 2s
			await new Promise<void>((resolve) => setTimeout(resolve, delay));
		}
	}
}
export async function GET({ setHeaders }) {
try { try {
if (USE_REDIS_CACHE === 'true') { if (USE_REDIS_CACHE === 'true') {
const cached: string | null = await redis.get('bandcampAlbums'); const cached: string | null = await redis.get('bandcampAlbums');
if (cached) { if (cached) {
const response: Album[] = JSON.parse(cached); const response: Album[] = JSON.parse(cached);
const ttl = await redis.ttl("bandcampAlbums"); const ttl = await redis.ttl('bandcampAlbums');
if (ttl) { if (ttl) {
setHeaders({ setHeaders({
"cache-control": `max-age=${ttl}`, 'cache-control': `max-age=${ttl}`,
}); });
} else { } else {
setHeaders({ setHeaders({
"cache-control": "max-age=43200", 'cache-control': 'max-age=43200',
}); });
} }
return json(response); return json(response);
} }
} }
const { data }: ScrapeResult<BandCampResults> = await scrapeIt(`https://bandcamp.com/${BANDCAMP_USERNAME}`, { // Scrape Bandcamp with realistic headers, plus retry/backoff
collectionItems: { const { data }: ScrapeResult<BandCampResults> = await retryWithBackoff(async () =>
listItem: '.collection-item-container', await scrapeIt(`https://bandcamp.com/${BANDCAMP_USERNAME}`, {
data: { collectionItems: {
url: { listItem: '.collection-item-container',
selector: '.collection-title-details > a.item-link', data: {
attr: 'href', url: { selector: '.collection-title-details > a.item-link', attr: 'href' },
}, artwork: { selector: 'div.collection-item-art-container a img', attr: 'src' },
artwork: { title: { selector: 'span.item-link-alt > div.collection-item-title' },
selector: 'div.collection-item-art-container a img', artist: { selector: 'span.item-link-alt > div.collection-item-artist' },
attr: 'src',
},
title: {
selector: 'span.item-link-alt > div.collection-item-title',
},
artist: {
selector: 'span.item-link-alt > div.collection-item-artist',
}, },
}, },
}, })
}); );
const albums: Album[] = data?.collectionItems || []; const albums: Album[] = data?.collectionItems || [];
if (albums && albums.length > 0) {
if (albums && albums?.length > 0) {
if (USE_REDIS_CACHE === 'true') { if (USE_REDIS_CACHE === 'true') {
redis.set('bandcampAlbums', JSON.stringify(albums), 'EX', 43200); redis.set('bandcampAlbums', JSON.stringify(albums), 'EX', 43200);
} }
setHeaders({ setHeaders({ 'cache-control': 'max-age=43200' });
"cache-control": "max-age=43200",
});
return json(albums); return json(albums);
} }
return json([]); return json([]);
} catch (error) { } catch (error) {
console.error(error); console.error(error);
return json([]); return json([]);