parent
19115fa172
commit
b4a67cb9bf
@ -0,0 +1,64 @@
|
|||||||
|
import fs from 'node:fs/promises';
|
||||||
|
import path from 'node:path';
|
||||||
|
import { Buffer } from 'node:buffer';
|
||||||
|
import puppeteer from "puppeteer";
|
||||||
|
|
||||||
|
// Web archive index of saved Paragon Machine Works docs: technical drawings and files
|
||||||
|
// Wildcard capture listing on web.archive.org for everything under the
// site's public-docs folder.
const TARGET_URL = 'https://web.archive.org/web/*/https://www.paragonmachineworks.com/files/public-docs/*';

// CSS selector of the search box used to filter the listing.
const SEARCH_INPUT_LOCATOR = 'input[type=search]';

// Folder (relative to the working directory) that receives the downloads.
const OUT = './pdfs';
|
||||||
|
|
||||||
|
// Start a browser, open a tab and load the capture listing.
const browser = await puppeteer.launch();
const page = await browser.newPage();

await page.goto(TARGET_URL);

// PART ONE -- download single PDFs
// Type "pdf" into the listing's search box, then wait for the first result
// cell. `timeout: 0` disables Puppeteer's wait timeout, so this waits as
// long as the listing needs to render.
await page.locator(SEARCH_INPUT_LOCATOR).fill('pdf');
await page.waitForSelector('td.url', { timeout: 0 });

// Collect the link of every table row that has exactly one capture and
// exactly one unique capture. The callback is evaluated inside the page,
// so it must be self-contained and may only use browser APIs.
const rows = await page.$$eval('tr', (tableRows) =>
  tableRows
    .map((row) => {
      // Read the integer shown in the first descendant with the given class.
      // Non-digits are stripped first so a grouped number such as "1,234" is
      // not misread as 1; a missing or empty cell yields NaN and never
      // matches.
      const readCount = (className) => {
        const text = row.getElementsByClassName(className)[0]?.textContent ?? '';
        return Number.parseInt(text.replace(/\D/g, ''), 10);
      };

      const url = row.querySelector('a')?.href;
      const isSingleCapture = readCount('captures') === 1 && readCount('uniques') === 1;

      return isSingleCapture ? url : null;
    })
    // Drop rows that did not qualify or that carry no link.
    .filter((url) => url),
);
|
||||||
|
|
||||||
|
/**
 * Download every URL in `urls` and save each response body into `outPath`.
 *
 * Each file is named after the last path segment of its URL, so two URLs
 * with the same final segment overwrite each other. Failures are logged and
 * skipped: a single broken capture does not abort the rest of the batch.
 *
 * @param {string[]} urls - Capture URLs of the form `.../web/<digits>/<original url>`.
 * @param {string} outPath - Directory to write into; created when missing.
 * @returns {Promise<void>} Resolves once every URL has been attempted.
 */
async function downloadFiles(urls, outPath) {
  try {
    // Create the output folder if it does not exist yet.
    await fs.mkdir(outPath, { recursive: true });

    for (const url of urls) {
      // Errors are handled per URL so that one network or disk failure only
      // skips that file instead of ending the whole loop.
      try {
        // Insert `id_` right after the `/web/<digits>` capture timestamp.
        // NOTE(review): presumably this is the Wayback Machine modifier that
        // serves the original file without the archive's page rewriting.
        const fullUrl = url.replace(/(\/web\/\d+)/, '$1id_');
        const response = await fetch(fullUrl);

        if (!response.ok) {
          console.error(`Ошибка при загрузке ${fullUrl}: ${response.statusText}`);
          continue;
        }

        // Derive the file name from the URL path.
        const fileName = path.basename(new URL(url).pathname);
        const destination = path.join(outPath, fileName);

        // Write the file to disk.
        const arrayBuffer = await response.arrayBuffer();
        await fs.writeFile(destination, Buffer.from(arrayBuffer));

        console.log(`✅ Сохранено: ${fileName}`);
      } catch (error) {
        console.error(`Ошибка при загрузке ${url}: ${error.message}`);
      }
    }
  } catch (error) {
    // Reached only for failures outside a single download, e.g. the output
    // folder cannot be created or `urls` is not iterable.
    console.error('Критическая ошибка:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Run the downloads, then always shut the browser down. Awaiting close()
// inside `finally` means the browser process is released even if the
// download step throws, and a failing close() is reported rather than left
// as an unhandled floating promise.
try {
  await downloadFiles(rows, OUT);
} finally {
  await browser.close();
}
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,19 @@
|
|||||||
|
{
|
||||||
|
"name": "paragon_docs_saver",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "As Paragon Machine Works closed, I need to save publicly available copies of technical drawings and other documentation.",
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "ssh://git@gitea.bjornmossa.net/Cyclocrust/paragon_docs_saver.git"
|
||||||
|
},
|
||||||
|
"license": "ISC",
|
||||||
|
"author": "",
|
||||||
|
"type": "module",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"puppeteer": "^24.40.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue