import fs from 'node:fs/promises';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import puppeteer from 'puppeteer';

// Archive of saved PWM docs: technical drawings and files.
// Scrapes the Wayback Machine index listing for the Paragon Machine Works
// public-docs folder and downloads every PDF that has exactly one capture
// and one unique snapshot.
const TARGET_URL = "https://web.archive.org/web/*/https://www.paragonmachineworks.com/files/public-docs/*";
const SEARCH_INPUT_LOCATOR = "input[type=search]";
const OUT = './pdfs';

/**
 * Download each archived URL into `outPath`.
 *
 * Each Wayback URL is rewritten to its `id_` form (raw original bytes,
 * without the archive's injected toolbar/rewriting) before fetching.
 * Failures on individual URLs are logged and skipped so one bad download
 * does not abort the whole batch.
 *
 * @param {string[]} urls - Wayback Machine capture URLs (e.g. .../web/2021.../...pdf)
 * @param {string} outPath - directory to write files into (created if missing)
 * @returns {Promise<void>}
 */
async function downloadFiles(urls, outPath) {
  try {
    // Create the output folder if it does not exist yet
    await fs.mkdir(outPath, { recursive: true });
  } catch (error) {
    console.error('Критическая ошибка:', error.message);
    return;
  }

  for (const url of urls) {
    // The `id_` timestamp suffix asks the Wayback Machine for the raw
    // original file instead of the HTML-wrapped replay page.
    const fullUrl = url.replace(/(\/web\/\d+)/, '$1id_');
    try {
      const response = await fetch(fullUrl);
      if (!response.ok) {
        console.error(`Ошибка при загрузке ${fullUrl}: ${response.statusText}`);
        continue;
      }

      // Derive the local file name from the URL path
      const fileName = path.basename(new URL(url).pathname);
      const destination = path.join(outPath, fileName);

      // Write the file to disk
      const arrayBuffer = await response.arrayBuffer();
      await fs.writeFile(destination, Buffer.from(arrayBuffer));
      console.log(`✅ Сохранено: ${fileName}`);
    } catch (error) {
      // Network-level fetch/write failure for this URL only; keep going.
      console.error('Критическая ошибка:', error.message);
    }
  }
}

const browser = await puppeteer.launch();
try {
  const page = await browser.newPage();
  await page.goto(TARGET_URL);

  // PART ONE -- download single PDFs
  await page.locator(SEARCH_INPUT_LOCATOR).fill('pdf');
  // timeout: 0 disables the wait timeout — the archive index can be slow.
  await page.waitForSelector('td.url', { timeout: 0 });

  // Collect the capture URL from every table row that shows exactly one
  // capture and one unique snapshot (i.e. a single, unambiguous file).
  const rows = await page.$$eval("tr", (trs) =>
    trs
      .map((row) => {
        const url = row.querySelector("a")?.href;
        const captures = row.getElementsByClassName("captures")[0]?.textContent;
        const uniques = row.getElementsByClassName("uniques")[0]?.textContent;
        const capturesCount = captures ? Number.parseInt(captures, 10) : 0;
        const uniquesCount = uniques ? Number.parseInt(uniques, 10) : 0;
        return capturesCount === 1 && uniquesCount === 1 ? url : null;
      })
      .filter((url) => url)
  );

  await downloadFiles(rows, OUT);
} finally {
  // Always release the browser process, even if scraping throws.
  await browser.close();
}