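// search-rooms.js: search chat.luvul.net rooms by keyword, rank the matches by
// past-log volume, and (unless --report-only is given) crawl each matched room.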
const https = require('https');
|
|
const parser = require('node-html-parser');
|
|
const { processRoom } = require('./index');
|
|
|
|
/** Page size assumed for search listings; searchRooms advances `pageFrom` by this and stops on a shorter page. */
const SEARCH_PAGE_SIZE = 20;

/** Page size assumed for past-log listings; countPastLogPages advances `pageFrom` by this and stops on a shorter page. */
const PAST_LOG_PAGE_SIZE = 100;

/** Pause (ms) between consecutive listing requests and between rooms handled by one analysis worker. */
const ANALYZE_DELAY_MS = 400;

/** Upper bound on the number of analysis workers analyzeRooms runs at the same time. */
const ANALYZE_CONCURRENCY = 6;

/** Pause (ms) between two consecutive room crawls in crawlRoomsSequentially. */
const CRAWL_ROOM_DELAY_MS = 5000;
|
|
|
|
/**
 * Wait for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed directly to setTimeout.
 * @returns {Promise<void>} Promise that fulfils (with no value) when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
/**
 * Backslash-escape every regular-expression metacharacter in a string so the
 * result can be embedded in a RegExp and match the input literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with each of . * + ? ^ $ { } ( ) | [ ] \ prefixed by a backslash.
 */
function escapeRegExp(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, (found) => `\\${found}`);
}
|
|
|
|
/**
 * Characters produced by the entities decodeHtml understands, keyed by the
 * text between the leading ampersand and the trailing semicolon.
 */
const HTML_ENTITY_CHARS = Object.freeze({
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  '#39': "'",
  '#039': "'",
});

/**
 * Decode the basic HTML entities (amp, lt, gt, quot, apos and the numeric
 * apostrophe forms #39 / #039) into the characters they stand for.
 *
 * The replacement is done in a single pass so that text which was escaped
 * twice is only unescaped once (an escaped ampersand followed by "lt;" yields
 * the entity text, not a "<"). Chaining one replace per entity would decode
 * such input twice.
 *
 * @param {string} value - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities replaced; anything else is left untouched.
 */
function decodeHtml(value) {
  return value.replace(
    /&(amp|lt|gt|quot|apos|#0?39);/g,
    (entity, name) => HTML_ENTITY_CHARS[name]
  );
}
|
|
|
|
/**
 * Reduce a past-log list heading to the bare room title.
 *
 * Removes a leading "「" and a trailing "」の過去ログ一覧", then trims the
 * remaining whitespace. Trimming happens last, so the bracket and suffix are
 * only removed when they sit at the very start / end of the input.
 *
 * @param {string} value - Heading text.
 * @returns {string} The heading without the decoration.
 */
function normalizePastLogTitle(value) {
  const withoutOpeningBracket = value.replace(/^「/, '');
  const withoutListSuffix = withoutOpeningBracket.replace(/」の過去ログ一覧$/, '');
  return withoutListSuffix.trim();
}
|
|
|
|
/**
 * Download a URL over HTTPS and return the response body as UTF-8 text.
 *
 * @param {string} targetUrl - Absolute https URL to GET.
 * @param {number} [timeoutMs=30000] - Socket inactivity timeout in milliseconds.
 * @returns {Promise<string>} The complete response body.
 * @throws {Error} (as a rejection) when the status code is not 200, when the
 *   request or the response stream fails, or when the socket stays idle for
 *   longer than `timeoutMs`.
 */
function fetchText(targetUrl, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const request = https.get(targetUrl, { timeout: timeoutMs }, (response) => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released.
        response.resume();
        reject(new Error(`Request failed (${response.statusCode}) for ${targetUrl}`));
        return;
      }

      response.setEncoding('utf8');
      let rawData = '';
      response.on('data', (chunk) => {
        rawData += chunk;
      });
      response.on('end', () => resolve(rawData));
      // Without a listener a failure while the body is streaming would be an
      // uncaught 'error' event instead of a rejection.
      response.on('error', reject);
    });

    // The 'timeout' event only notifies; the request has to be torn down
    // explicitly. destroy(err) surfaces through the 'error' handler below.
    request.on('timeout', () => {
      request.destroy(new Error(`Request timed out after ${timeoutMs} ms for ${targetUrl}`));
    });
    request.on('error', reject);
  });
}
|
|
|
|
/**
 * Build the room-search URL for a keyword.
 *
 * @param {string} keyword - Search text, sent as the `search` parameter.
 * @param {number} pageFrom - Result offset; only added to the query when greater than 0.
 * @returns {string} URL on chat.luvul.net with the `#freerooms` fragment appended.
 */
function buildSearchUrl(keyword, pageFrom) {
  const query = new URLSearchParams([
    ['action', 'Index'],
    ['search', keyword],
    ['ie', 'UTF-8'],
  ]);

  const isLaterPage = pageFrom > 0;
  if (isLaterPage) {
    query.append('pageFrom', String(pageFrom));
  }

  return `https://chat.luvul.net/?${query.toString()}#freerooms`;
}
|
|
|
|
/**
 * Build the past-log list URL for one room.
 *
 * @param {string|number} roomId - Room identifier, sent as `room_id`.
 * @param {number} pageFrom - Offset into the past-log list; always included.
 * @returns {string} URL on chat.luvul.net for the `PastLogList` action.
 */
function buildPastLogUrl(roomId, pageFrom) {
  const query = new URLSearchParams();
  query.append('action', 'PastLogList');
  query.append('room_id', String(roomId));
  query.append('pageFrom', String(pageFrom));
  return `https://chat.luvul.net/?${query.toString()}`;
}
|
|
|
|
/**
 * Extract the rooms listed on one search-result page.
 *
 * Each `div.freeroomlistline` contributes at most one room, taken from its
 * first link whose href contains `/ChatRoom?room_id=`. Lines without such a
 * link, or whose href has no numeric room id, are skipped. When a room id
 * appears more than once, the first occurrence wins.
 *
 * @param {string} html - Raw HTML of the search page.
 * @param {string} keyword - Keyword that was searched for.
 * @returns {{rooms: Array<{roomId: string, title: string}>, hasFilter: boolean}}
 *   `rooms` in first-seen order; `hasFilter` is true when no keyword was given
 *   or when the page contains the `class="filtered"` marker.
 */
function parseSearchResults(html, keyword) {
  const roomLinkSelector = 'a[href*="/ChatRoom?room_id="]';
  const root = parser.parse(html);
  const roomsById = new Map();

  for (const listLine of root.querySelectorAll('div.freeroomlistline')) {
    const anchor = listLine.querySelector(roomLinkSelector);
    const idMatch = anchor
      ? (anchor.getAttribute('href') || '').match(/room_id=(\d+)/)
      : null;

    if (idMatch) {
      const [, roomId] = idMatch;
      const title = decodeHtml(anchor.text.trim());

      if (!roomsById.has(roomId)) {
        roomsById.set(roomId, { roomId, title });
      }
    }
  }

  const hasFilter = !keyword || html.includes('class="filtered"');
  return { rooms: Array.from(roomsById.values()), hasFilter };
}
|
|
|
|
/**
 * Collect every room matching a keyword by walking the paginated search results.
 *
 * Pagination stops at the first page that lists fewer than SEARCH_PAGE_SIZE
 * rooms, or at the first page that contributes no room that has not been seen
 * already. The second condition guards against a site that ignores `pageFrom`
 * and keeps returning the same full page, which would otherwise loop forever.
 *
 * @param {string} keyword - Text to search for.
 * @returns {Promise<Array<{roomId: string, title: string}>>} Unique rooms in first-seen order.
 * @throws {Error} (as a rejection) when the first page reports that the search
 *   filter was not applied, or when fetching a page fails.
 */
async function searchRooms(keyword) {
  const rooms = [];
  const seenRoomIds = new Set();
  let pageFrom = 0;
  let pageIndex = 1;

  while (true) {
    const html = await fetchText(buildSearchUrl(keyword, pageFrom));
    const parsed = parseSearchResults(html, keyword);

    if (pageIndex === 1 && !parsed.hasFilter) {
      throw new Error('Search filter was not applied. The site may have changed its parameters.');
    }

    let newRoomCount = 0;
    for (const room of parsed.rooms) {
      if (seenRoomIds.has(room.roomId)) {
        continue;
      }
      seenRoomIds.add(room.roomId);
      rooms.push(room);
      newRoomCount += 1;
    }

    console.log(`search page ${pageIndex}: ${parsed.rooms.length} room(s)`);

    if (parsed.rooms.length < SEARCH_PAGE_SIZE) {
      break;
    }

    if (newRoomCount === 0) {
      // A full page made only of rooms we already have means the offset is not
      // advancing the listing; requesting further pages cannot make progress.
      console.warn(`search page ${pageIndex}: no new rooms, stopping pagination`);
      break;
    }

    pageFrom += SEARCH_PAGE_SIZE;
    pageIndex += 1;
    await sleep(ANALYZE_DELAY_MS);
  }

  return rooms;
}
|
|
|
|
/**
 * Read the room title and the number of past-log links from one past-log list page.
 *
 * @param {string} html - Raw HTML of the past-log list page.
 * @param {string} roomId - Room the page belongs to; accepted for symmetry with the caller but not read here.
 * @returns {{title: string, entryCount: number}} `title` comes from the `h3.ex`
 *   heading (empty input to the normalizer when the heading is missing);
 *   `entryCount` is the number of `ul > li > a` links whose href contains `/PastLog?`.
 */
function parsePastLogPage(html, roomId) {
  const root = parser.parse(html);

  const heading = root.querySelector('h3.ex');
  const headingText = heading?.text.trim() || '';
  const title = normalizePastLogTitle(decodeHtml(headingText));

  const logLinks = root.querySelectorAll('ul > li > a[href*="/PastLog?"]');

  return { title, entryCount: logLinks.length };
}
|
|
|
|
/**
 * Walk a room's past-log list and tally how many pages and log entries it has.
 *
 * Pages are requested at offsets 0, PAST_LOG_PAGE_SIZE, 2 * PAST_LOG_PAGE_SIZE, …
 * until a page holds fewer than PAST_LOG_PAGE_SIZE entries. Every fetched page
 * is counted, including that final short (possibly empty) one. A non-empty
 * title found on a page replaces the title the room came in with.
 *
 * @param {{roomId: string, title: string}} room - Room to inspect.
 * @returns {Promise<object>} Copy of `room` with `title`, `pastLogPages` and `pastLogEntries` set.
 */
async function countPastLogPages(room) {
  const tally = { title: room.title, pastLogPages: 0, pastLogEntries: 0 };
  let offset = 0;
  let morePages = true;

  while (morePages) {
    const page = parsePastLogPage(
      await fetchText(buildPastLogUrl(room.roomId, offset)),
      room.roomId
    );

    tally.pastLogPages += 1;
    tally.pastLogEntries += page.entryCount;
    tally.title = page.title || tally.title;

    morePages = !(page.entryCount < PAST_LOG_PAGE_SIZE);
    if (morePages) {
      offset += PAST_LOG_PAGE_SIZE;
      await sleep(ANALYZE_DELAY_MS);
    }
  }

  return { ...room, ...tally };
}
|
|
|
|
/**
 * Run countPastLogPages over every room using a small pool of workers.
 *
 * Up to ANALYZE_CONCURRENCY workers run at once. Each worker repeatedly claims
 * the next unprocessed index, analyzes that room, stores the result at the same
 * index, and pauses ANALYZE_DELAY_MS before claiming again.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to analyze.
 * @returns {Promise<Array<object>>} Analysis results in the same order as `rooms`.
 */
async function analyzeRooms(rooms) {
  const results = new Array(rooms.length);
  let cursor = 0;

  // Claiming is synchronous, so two workers can never take the same index.
  const claimNextIndex = () => {
    const claimed = cursor;
    cursor += 1;
    return claimed;
  };

  const runWorker = async () => {
    for (let slot = claimNextIndex(); slot < rooms.length; slot = claimNextIndex()) {
      const room = rooms[slot];
      console.log(`analyze ${slot + 1}/${rooms.length}: ${room.roomId} ${room.title}`);

      const summary = await countPastLogPages(room);
      results[slot] = summary;
      console.log(` -> ${summary.pastLogPages} page(s), ${summary.pastLogEntries} log(s)`);

      await sleep(ANALYZE_DELAY_MS);
    }
  };

  const workerCount = Math.min(ANALYZE_CONCURRENCY, rooms.length);
  const workers = Array.from({ length: workerCount }, () => runWorker());
  await Promise.all(workers);

  return results;
}
|
|
|
|
/**
 * Print a ranking of rooms to the console.
 *
 * Rooms are ordered by past-log page count (descending), then by past-log
 * entry count (descending), then by room id (ascending, via localeCompare).
 * The input array is not modified.
 *
 * @param {Array<{roomId: string, title: string, pastLogPages: number, pastLogEntries: number}>} rooms - Analyzed rooms.
 * @param {number} topN - Maximum number of rooms to print.
 * @returns {void}
 */
function printTopRooms(rooms, topN) {
  const descendingKeys = ['pastLogPages', 'pastLogEntries'];

  const byVolume = (left, right) => {
    for (const key of descendingKeys) {
      if (right[key] !== left[key]) {
        return right[key] - left[key];
      }
    }
    return left.roomId.localeCompare(right.roomId);
  };

  const ranked = Array.from(rooms).sort(byVolume).slice(0, topN);

  console.log('');
  console.log(`Top ${ranked.length} rooms by past log pages`);
  ranked.forEach((room, position) => {
    const rank = String(position + 1).padStart(3, ' ');
    console.log(
      `${rank}. [${room.roomId}] ${room.title} | ${room.pastLogPages} page(s) | ${room.pastLogEntries} log(s)`
    );
  });
}
|
|
|
|
/**
 * Promise adapter around the callback-style `processRoom`.
 *
 * The promise's resolve function is passed as the completion callback, so the
 * promise fulfils with whatever `processRoom` hands to that callback. There is
 * no rejection path here.
 *
 * @param {string} roomId - Room to crawl.
 * @param {boolean} fullMode - Forwarded unchanged as the third argument of `processRoom`.
 * @returns {Promise<*>} Settles when `processRoom` invokes its callback.
 */
function crawlRoom(roomId, fullMode) {
  return new Promise((done) => {
    processRoom(roomId, done, fullMode);
  });
}
|
|
|
|
/**
 * Crawl rooms one after another, pausing CRAWL_ROOM_DELAY_MS between rooms
 * (but not after the last one).
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to crawl, in order.
 * @param {boolean} fullMode - Forwarded to crawlRoom for every room.
 * @returns {Promise<void>} Resolves once the last room has been crawled.
 */
async function crawlRoomsSequentially(rooms, fullMode) {
  for (const [position, room] of rooms.entries()) {
    console.log(`crawl ${position + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
    await crawlRoom(room.roomId, fullMode);

    const hasNextRoom = position < rooms.length - 1;
    if (hasNextRoom) {
      await sleep(CRAWL_ROOM_DELAY_MS);
    }
  }
}
|
|
|
|
/**
 * Parse the command-line arguments.
 *
 * Recognized flags: `--full`, `--report-only`, and `--top <n>` (value taken
 * from the following argument). Every other argument is treated as part of the
 * search keyword; the parts are joined with single spaces.
 *
 * @param {string[]} argv - Arguments without the node executable and script path.
 * @returns {{fullMode: boolean, reportOnly: boolean, top: number, keyword: string}} Parsed options.
 * @throws {Error} When `--top` is not followed by a positive integer, or when no keyword is given.
 */
function parseArgs(argv) {
  const options = { fullMode: false, reportOnly: false, top: 100 };
  const keywordParts = [];

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    cursor += 1;

    switch (token) {
      case '--full':
        options.fullMode = true;
        break;

      case '--report-only':
        options.reportOnly = true;
        break;

      case '--top': {
        // `cursor` already points at the argument after the flag.
        const requested = Number(argv[cursor]);
        if (!(Number.isInteger(requested) && requested > 0)) {
          throw new Error('--top requires a positive integer');
        }
        options.top = requested;
        cursor += 1;
        break;
      }

      default:
        keywordParts.push(token);
    }
  }

  if (keywordParts.length === 0) {
    throw new Error('Usage: node search-rooms.js <keyword> [--top 100] [--report-only] [--full]');
  }

  options.keyword = keywordParts.join(' ');
  return options;
}
|
|
|
|
/**
 * Program flow: parse the arguments, search for rooms, analyze their past-log
 * volume, print the ranking and, unless `--report-only` was given, crawl every
 * analyzed room in sequence.
 *
 * Returns early (after printing the match count) when the search finds nothing.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { keyword, top, reportOnly, fullMode } = parseArgs(process.argv.slice(2));
  console.log(`search keyword: ${keyword}`);

  const rooms = await searchRooms(keyword);
  console.log(`matched rooms: ${rooms.length}`);
  if (rooms.length === 0) {
    return;
  }

  const analyzedRooms = await analyzeRooms(rooms);
  printTopRooms(analyzedRooms, top);

  if (!reportOnly) {
    console.log('');
    console.log(`crawl start: ${analyzedRooms.length} room(s)`);
    await crawlRoomsSequentially(analyzedRooms, fullMode);
    console.log('crawl complete');
  }
}
|
|
|
|
/**
 * Last-resort handler for a failed run: print the error message and mark the
 * process as failed without forcing an immediate exit.
 *
 * @param {Error} error - Reason the run was rejected.
 */
function reportFatalError(error) {
  console.error(error.message);
  process.exitCode = 1;
}

main().catch(reportFatalError);