Парсинг базовых документов

main
lambda 3 weeks ago
parent 19115fa172
commit b4a67cb9bf
No known key found for this signature in database
GPG Key ID: CF45A8EE158BBCB6

@ -0,0 +1,64 @@
import fs from 'node:fs/promises';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import puppeteer from "puppeteer";
// Wayback Machine listing of archived Paragon Machine Works documents
// (technical drawings and other files).
const TARGET_URL = "https://web.archive.org/web/*/https://www.paragonmachineworks.com/files/public-docs/*";
const SEARCH_INPUT_LOCATOR = "input[type=search]";
const OUT = './pdfs';
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(TARGET_URL);
// PART ONE -- download single PDFs
// Put "pdf" into the listing's search input.
await page.locator(SEARCH_INPUT_LOCATOR).fill('pdf');
// timeout: 0 disables Puppeteer's wait timeout, so this waits for as long as
// it takes for a `td.url` cell to appear.
await page.waitForSelector('td.url', {timeout: 0});
// Collect the link of every table row whose "captures" and "uniques" cells
// both parse to exactly 1. Rows without a link, or with a missing/empty count
// cell, are dropped.
const rows = await page.$$eval("tr", (tableRows) => tableRows.map((row) => {
  const link = row.querySelector("a");
  const url = link?.href;
  const captures = row.getElementsByClassName("captures")[0]?.textContent;
  const uniques = row.getElementsByClassName("uniques")[0]?.textContent;
  // A missing or empty cell counts as 0 so the row is filtered out below.
  const capturesCount = captures ? Number.parseInt(captures, 10) : 0;
  const uniquesCount = uniques ? Number.parseInt(uniques, 10) : 0;
  return capturesCount === 1 && uniquesCount === 1 ? url : null;
}).filter((url) => url));
/**
 * Downloads each URL in `urls` and writes the response body to a file in
 * `outPath`, named after the last segment of the URL's path.
 *
 * Failures are logged, never thrown: a URL that cannot be fetched or saved is
 * reported and skipped so the remaining URLs are still processed. A failure
 * outside the per-URL work (e.g. creating the directory) is logged as critical
 * and ends the run.
 *
 * @param {Iterable<string>} urls - URLs to download.
 * @param {string} outPath - Directory to save into; created if missing.
 * @returns {Promise<void>}
 */
async function downloadFiles(urls, outPath) {
  try {
    // Create the output directory if it does not exist yet
    await fs.mkdir(outPath, { recursive: true });
    for (const url of urls) {
      // Each URL gets its own try/catch so that one network or disk error
      // does not abort the rest of the batch.
      try {
        // Insert `id_` right after the `/web/<digits>` segment.
        // NOTE(review): presumably this asks the Wayback Machine for the raw,
        // unmodified capture -- confirm.
        const fullUrl = url.replace(/(\/web\/\d+)/, '$1id_');
        const response = await fetch(fullUrl);
        if (!response.ok) {
          console.error(`Ошибка при загрузке ${fullUrl}: ${response.statusText}`);
          continue;
        }
        // Derive the file name from the URL path
        const fileName = path.basename(new URL(url).pathname);
        const destination = path.join(outPath, fileName);
        // Write the file to disk
        const arrayBuffer = await response.arrayBuffer();
        await fs.writeFile(destination, Buffer.from(arrayBuffer));
        console.log(`✅ Сохранено: ${fileName}`);
      } catch (error) {
        console.error(`Ошибка при загрузке ${url}: ${error.message}`);
      }
    }
  } catch (error) {
    console.error('Критическая ошибка:', error.message);
  }
}
// Save every collected URL into the output directory.
await downloadFiles(rows, OUT);
// close() returns a promise; await it so a shutdown failure is not left as a
// floating, unhandled rejection.
await browser.close();

1137
package-lock.json generated

File diff suppressed because it is too large Load Diff

@ -0,0 +1,19 @@
{
"name": "paragon_docs_saver",
"version": "1.0.0",
"description": "Paragon Machine Works has closed, so this tool saves publicly available copies of its technical drawings and other documentation.",
"repository": {
"type": "git",
"url": "ssh://git@gitea.bjornmossa.net/Cyclocrust/paragon_docs_saver.git"
},
"license": "ISC",
"author": "",
"type": "module",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"dependencies": {
"puppeteer": "^24.40.0"
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.
Loading…
Cancel
Save