You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

65 lines
2.1 KiB

import fs from 'node:fs/promises';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import puppeteer from "puppeteer";
// Archive of saved Paragon Machine Works docs: technical drawings and files.
// The URL points at web.archive.org and uses `*` wildcards for both the
// capture timestamp and everything under the public-docs path.
const TARGET_URL = "https://web.archive.org/web/*/https://www.paragonmachineworks.com/files/public-docs/*";
// CSS selector for the search input that is filled in after the page loads.
const SEARCH_INPUT_LOCATOR = "input[type=search]";
// Directory the downloaded files are written to (passed to downloadFiles).
const OUT = './pdfs';
// Launch a browser, open a single tab and navigate it to the listing page.
// NOTE(review): nothing here guards against launch/goto throwing; if they do,
// the browser close at the end of the script is never reached.
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(TARGET_URL);
// PART ONE -- download single PDFs
// Type "pdf" into the search box (presumably this filters the listing to
// entries containing "pdf" -- confirm against the live page).
await page.locator(SEARCH_INPUT_LOCATOR).fill('pdf');
// `timeout: 0` disables Puppeteer's wait timeout, so this waits indefinitely
// for the first `td.url` cell to appear.
await page.waitForSelector('td.url', { timeout: 0 });
// Collect the link of every table row that has exactly one capture and exactly
// one unique version. The callback is evaluated inside the page, so it must be
// self-contained and cannot reference names from this module.
const rows = await page.$$eval('tr', (tableRows) => {
  // Reads the integer shown in the first element with `className` inside `row`.
  // Returns 0 when the cell is missing, empty, or not numeric.
  const readCount = (row, className) => {
    const text = row.getElementsByClassName(className)[0]?.textContent ?? '';
    // Drop thousands separators / whitespace so that a value such as "1,234"
    // is not truncated to 1 by parseInt (in case the page renders separators).
    const count = Number.parseInt(text.replace(/[,\s]/g, ''), 10);
    return Number.isNaN(count) ? 0 : count;
  };

  return tableRows
    .filter((row) => readCount(row, 'captures') === 1 && readCount(row, 'uniques') === 1)
    .map((row) => row.querySelector('a')?.href)
    // Rows without a link (or with an empty href) are discarded.
    .filter(Boolean);
});
/**
 * Downloads each URL in `urls` sequentially and stores it in `outPath`.
 *
 * Before fetching, `id_` is appended to the first `/web/<digits>` segment of
 * the URL (presumably the Wayback Machine modifier for the raw, unmodified
 * capture -- confirm against archive.org documentation). The file name is the
 * last path segment of the original URL, so two URLs with the same base name
 * overwrite each other.
 *
 * Failures are isolated per URL: a non-OK response, a network error or a write
 * error is logged and the remaining URLs are still processed. The function
 * never rejects; if the output directory cannot be created it logs the error
 * and returns without downloading anything.
 *
 * @param {string[]} urls - URLs of the archived files to download.
 * @param {string} outPath - Directory to write into; created if missing.
 * @returns {Promise<void>}
 */
async function downloadFiles(urls, outPath) {
  try {
    // Create the output directory if it does not exist yet.
    await fs.mkdir(outPath, { recursive: true });
  } catch (error) {
    console.error('Критическая ошибка:', error.message);
    return;
  }

  for (const url of urls) {
    // Each URL gets its own try/catch so one failure cannot abort the batch.
    try {
      const fullUrl = url.replace(/(\/web\/\d+)/, '$1id_');
      const response = await fetch(fullUrl);
      if (!response.ok) {
        console.error(`Ошибка при загрузке ${fullUrl}: ${response.statusText}`);
        continue;
      }

      // Derive the file name from the last segment of the URL path.
      const fileName = path.basename(new URL(url).pathname);
      const destination = path.join(outPath, fileName);

      // Write the response body to disk.
      const arrayBuffer = await response.arrayBuffer();
      await fs.writeFile(destination, Buffer.from(arrayBuffer));
      console.log(`✅ Сохранено: ${fileName}`);
    } catch (error) {
      console.error(`Ошибка при загрузке ${url}: ${error.message}`);
    }
  }
}
// Download every collected link, then shut the browser down. The close is
// awaited (so a failure to close is not a floating rejection) and placed in
// `finally` so the browser process is released even if the download step throws.
try {
  await downloadFiles(rows, OUT);
} finally {
  await browser.close();
}