162 lines
6.5 KiB
JavaScript
162 lines
6.5 KiB
JavaScript
const https = require("https");
|
|
const fs = require("fs");
|
|
const parser = require('node-html-parser');
|
|
const { convert } = require('html-to-text');
|
|
const options = {
|
|
wordwrap: 130,
|
|
blockTextElements: {
|
|
script: false
|
|
},
|
|
selectors: [
|
|
{
|
|
selector: 'hr',
|
|
format: 'block'
|
|
},
|
|
]
|
|
};
|
|
const url = (rid, page) => "https://chat.luvul.net/?action=PastLogList&pageFrom=" + page + "&room_id=" + rid
|
|
const errot_callback = (e) => {
|
|
console.error(`Got error: ${e.message}`);
|
|
}
|
|
|
|
const RATE_LIMIT_COUNT = 10; // 何件ごとに待機するか
|
|
const RATE_LIMIT_DELAY = 30000; // 待機時間(ms)
|
|
|
|
// グローバル取得カウンター(全部屋共通)
|
|
let globalFetchCount = 0;
|
|
|
|
function fetchLogFile(path, name, onDone) {
|
|
https.get("https://chat.luvul.net" + path, (response) => {
|
|
if (response.statusCode !== 200) {
|
|
console.error(`Request failed (status code: ${response.statusCode})`);
|
|
response.resume();
|
|
onDone();
|
|
return;
|
|
}
|
|
response.setEncoding("utf8");
|
|
let rawData = '';
|
|
response.on('data', (chunk) => { rawData += chunk; });
|
|
response.on('end', () => {
|
|
rawData = rawData.replaceAll('href="/style/', 'href="../style/')
|
|
let dom = parser.parse(rawData);
|
|
dom.querySelectorAll('iframe').forEach(x => x.remove());
|
|
dom.querySelectorAll('img').forEach(x => x.remove());
|
|
dom.querySelectorAll('script').forEach(x => x.remove());
|
|
dom.querySelectorAll('link[href]').forEach(el => {
|
|
const href = el.getAttribute('href');
|
|
if (href) el.setAttribute('href', href.replace(/\?.*$/, ''));
|
|
});
|
|
fs.writeFileSync(name + ".html", dom.toString())
|
|
dom.querySelectorAll('hr').forEach(x => x.remove());
|
|
fs.writeFileSync(name + ".txt", convert(dom.toString(), options))
|
|
globalFetchCount++;
|
|
console.log(`[${globalFetchCount}] ${name}`)
|
|
onDone();
|
|
})
|
|
}).on('error', errot_callback);
|
|
}
|
|
|
|
function processQueue(queue, onAllDone) {
|
|
if (queue.length === 0) { onAllDone(); return; }
|
|
const { path, name } = queue.shift();
|
|
fetchLogFile(path, name, () => {
|
|
if (globalFetchCount % RATE_LIMIT_COUNT === 0) {
|
|
console.log(`${globalFetchCount}件取得済み、${RATE_LIMIT_DELAY / 1000}秒待機...`)
|
|
setTimeout(() => processQueue(queue, onAllDone), RATE_LIMIT_DELAY)
|
|
} else {
|
|
processQueue(queue, onAllDone)
|
|
}
|
|
})
|
|
}
|
|
|
|
function processRoom(room_id, doneCallback, fullMode) {
|
|
let page = 0
|
|
let newestExisting = null // 部屋ディレクトリ内の最新既存ファイル名(拡張子なし)、初回ページで確定
|
|
const callback = (response) => {
|
|
if (response.statusCode !== 200) {
|
|
console.error(`Request failed (status code: ${response.statusCode})`);
|
|
response.resume();
|
|
if (doneCallback) doneCallback();
|
|
return;
|
|
}
|
|
response.setEncoding("utf8");
|
|
let rawData = '';
|
|
response.on('data', (chunk) => { rawData += chunk; });
|
|
response.on('end', () => {
|
|
var title = "[" + room_id + "]" + rawData.match(/<title>「(.+?)」の過去ログ一覧/)[1]
|
|
var m = rawData.match(/<a href=".+?">.+?の過去ログ/gim)
|
|
if (!m) m = []
|
|
if (!fs.existsSync(title) || !fs.statSync(title).isDirectory()) {
|
|
fs.mkdirSync(title)
|
|
}
|
|
// 初回ページのみ: ディレクトリ内の既存HTMLから最新ファイルを特定
|
|
if (!fullMode && newestExisting === null) {
|
|
const existingFiles = fs.readdirSync(title)
|
|
.filter(f => f.endsWith('.html'))
|
|
.sort()
|
|
newestExisting = existingFiles.length > 0
|
|
? title + '/' + existingFiles[existingFiles.length - 1].replace(/\.html$/, '')
|
|
: ''
|
|
if (newestExisting) console.log(`newest existing: ${newestExisting}`)
|
|
}
|
|
const queue = []
|
|
m.forEach(l => {
|
|
l = l.replace(/&/g, "&")
|
|
var name = title + "/" + l.match(/>(.+)/)[1].replace(/[\/:]/g, "")
|
|
var path = l.match(/<a href="(.+?)">/)[1]
|
|
if (!fullMode && fs.existsSync(name + ".html") && name !== newestExisting) {
|
|
console.log('skip: ' + name)
|
|
return
|
|
}
|
|
queue.push({ path, name })
|
|
})
|
|
console.log(`room ${room_id} page ${page}: ${queue.length}件取得、${m.length - queue.length}件スキップ`)
|
|
page += 100
|
|
const hasNextPage = m.length >= 100
|
|
processQueue(queue, () => {
|
|
if (hasNextPage) {
|
|
https.get(url(room_id, page), callback).on('error', errot_callback)
|
|
} else {
|
|
if (doneCallback) doneCallback();
|
|
}
|
|
})
|
|
});
|
|
}
|
|
https.get(url(room_id, page), callback).on('error', errot_callback);
|
|
}
|
|
|
|
const args = process.argv.slice(2);
|
|
const fullMode = args.includes('--full');
|
|
const room_id = args.find(a => !a.startsWith('-'));
|
|
|
|
module.exports = {
|
|
processRoom,
|
|
}
|
|
|
|
if (require.main === module) {
|
|
if (room_id) {
|
|
console.log(room_id + (fullMode ? ' [full mode]' : ' [skip existing]'))
|
|
processRoom(room_id, null, fullMode)
|
|
} else {
|
|
// 既存のディレクトリからルームIDを収集して順番に処理する
|
|
const dirs = fs.readdirSync('.').filter(f => {
|
|
try { return fs.statSync(f).isDirectory() && /^\[\d+\]/.test(f); }
|
|
catch (e) { return false; }
|
|
});
|
|
const roomIds = [...new Set(dirs.map(d => d.match(/^\[(\d+)\]/)[1]))];
|
|
console.log('Found rooms: ' + roomIds.join(', ') + (fullMode ? ' [full mode]' : ' [skip existing]'));
|
|
|
|
let i = 0;
|
|
function next() {
|
|
if (i >= roomIds.length) {
|
|
console.log('All rooms processed');
|
|
return;
|
|
}
|
|
const rid = roomIds[i++];
|
|
console.log(`Processing room ${rid} (${i}/${roomIds.length})`);
|
|
processRoom(rid, () => setTimeout(next, 5000), fullMode);
|
|
}
|
|
next();
|
|
}
|
|
}
|