Files
luvul_crawl/search-rooms.js
T
2026-05-18 18:04:16 +09:00

324 lines
9.0 KiB
JavaScript

const https = require('https');
const parser = require('node-html-parser');
const { processRoom } = require('./index');
// Step added to `pageFrom` between search result pages; a page with fewer
// rooms than this is treated as the last one.
const SEARCH_PAGE_SIZE = 20;
// Step added to `pageFrom` between past-log listing pages; a page with fewer
// entries than this is treated as the last one.
const PAST_LOG_PAGE_SIZE = 100;
// Pause (ms) used between listing page requests and after each analyzed room.
const ANALYZE_DELAY_MS = 400;
// Upper bound on the number of analysis workers started at once.
const ANALYZE_CONCURRENCY = 6;
// Pause (ms) between rooms during the sequential crawl.
const CRAWL_ROOM_DELAY_MS = 5000;
/**
 * Wait for a timer before continuing.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<undefined>} Promise fulfilled (with no value) when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Backslash-escape every regular-expression metacharacter in a string so the
 * result can be embedded in a RegExp source and match the input literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // `$&` re-inserts the matched character after the added backslash.
  const escaped = value.replace(metaCharacters, '\\$&');
  return escaped;
}
// Entities recognised by decodeHtml, keyed by their full `&...;` spelling.
const HTML_ENTITIES = Object.freeze({
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
});

/**
 * Decode the basic HTML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`).
 *
 * All entities are replaced in a single pass so that the output of one
 * replacement is never re-scanned: `&amp;lt;` decodes to the text `&lt;`,
 * not to `<`. Bare ampersands and unknown entities are left untouched.
 *
 * @param {string} value - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities decoded.
 */
function decodeHtml(value) {
  return value.replace(/&(?:amp|lt|gt|quot|#39);/g, (entity) => HTML_ENTITIES[entity]);
}
/**
 * Reduce a past-log page heading to the bare room title.
 *
 * Removes a leading `「`, removes a trailing `」の過去ログ一覧`, then trims
 * surrounding whitespace. Text that does not match either pattern is only
 * trimmed.
 *
 * @param {string} value - Heading text.
 * @returns {string} The heading with the decoration removed.
 */
function normalizePastLogTitle(value) {
  const withoutOpeningBracket = value.replace(/^「/, '');
  const withoutSuffix = withoutOpeningBracket.replace(/」の過去ログ一覧$/, '');
  return withoutSuffix.trim();
}
/**
 * Download a URL over HTTPS and return the response body as a UTF-8 string.
 *
 * Redirects are not followed: any status other than 200 rejects.
 *
 * @param {string} targetUrl - Absolute https URL to request.
 * @param {number} [timeoutMs=30000] - Socket inactivity timeout in milliseconds;
 *   the request is aborted and the promise rejects when it elapses.
 * @returns {Promise<string>} The full response body.
 * @throws {Error} (as a rejection) on a non-200 status, a timeout, or any
 *   request/response stream error.
 */
function fetchText(targetUrl, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const request = https.get(targetUrl, { timeout: timeoutMs }, (response) => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released.
        response.resume();
        reject(new Error(`Request failed (${response.statusCode}) for ${targetUrl}`));
        return;
      }
      response.setEncoding('utf8');
      const chunks = [];
      response.on('data', (chunk) => {
        chunks.push(chunk);
      });
      // Without this listener a connection dropped mid-body surfaces as an
      // unhandled 'error' event instead of a rejection.
      response.on('error', reject);
      response.on('end', () => resolve(chunks.join('')));
    });
    // The 'timeout' event only notifies; the request must be destroyed
    // explicitly, which then emits 'error' with the supplied reason.
    request.on('timeout', () => {
      request.destroy(new Error(`Request timed out after ${timeoutMs} ms for ${targetUrl}`));
    });
    request.on('error', reject);
  });
}
/**
 * Build the room-search URL for a keyword and result offset.
 *
 * @param {string} keyword - Search text, sent as the `search` parameter.
 * @param {number} pageFrom - Result offset; only sent when greater than 0.
 * @returns {string} URL on chat.luvul.net ending in the `#freerooms` fragment.
 */
function buildSearchUrl(keyword, pageFrom) {
  const query = new URLSearchParams();
  query.append('action', 'Index');
  query.append('search', keyword);
  query.append('ie', 'UTF-8');
  // The first page is requested without any offset parameter.
  if (pageFrom > 0) {
    query.append('pageFrom', String(pageFrom));
  }
  const queryString = query.toString();
  return `https://chat.luvul.net/?${queryString}#freerooms`;
}
/**
 * Build the past-log listing URL for a room and entry offset.
 *
 * @param {string|number} roomId - Room identifier, sent as `room_id`.
 * @param {number} pageFrom - Entry offset, always sent (including 0).
 * @returns {string} URL on chat.luvul.net for the `PastLogList` action.
 */
function buildPastLogUrl(roomId, pageFrom) {
  const query = new URLSearchParams();
  query.append('action', 'PastLogList');
  query.append('room_id', String(roomId));
  query.append('pageFrom', String(pageFrom));
  const queryString = query.toString();
  return `https://chat.luvul.net/?${queryString}`;
}
/**
 * Extract the rooms listed on one search result page.
 *
 * Each `div.freeroomlistline` contributes at most one room, taken from its
 * first link whose href contains `/ChatRoom?room_id=`. Rows without such a
 * link, or whose href has no numeric room id, are skipped. When a room id
 * appears more than once, the first occurrence wins.
 *
 * @param {string} html - Raw HTML of the search page.
 * @param {string} keyword - Keyword that was searched for.
 * @returns {{rooms: Array<{roomId: string, title: string}>, hasFilter: boolean}}
 *   `hasFilter` is true when no keyword was given, otherwise it reports
 *   whether the page contains the literal text `class="filtered"`.
 */
function parseSearchResults(html, keyword) {
  const root = parser.parse(html);
  const roomsById = new Map();

  root.querySelectorAll('div.freeroomlistline').forEach((row) => {
    const anchor = row.querySelector('a[href*="/ChatRoom?room_id="]');
    if (!anchor) {
      return;
    }
    const target = anchor.getAttribute('href') || '';
    const idMatch = target.match(/room_id=(\d+)/);
    if (!idMatch) {
      return;
    }
    const [, roomId] = idMatch;
    const title = decodeHtml(anchor.text.trim());
    if (roomsById.has(roomId)) {
      return;
    }
    roomsById.set(roomId, { roomId, title });
  });

  return {
    rooms: Array.from(roomsById.values()),
    hasFilter: !keyword || html.includes('class="filtered"'),
  };
}
/**
 * Collect every room matching a keyword by walking the paginated search.
 *
 * Pagination stops when a page holds fewer rooms than SEARCH_PAGE_SIZE, or
 * when a full page contains no room that has not been seen already. The
 * second condition keeps the loop from running forever if the site ignores
 * the offset and keeps returning the same page.
 *
 * @param {string} keyword - Search text.
 * @returns {Promise<Array<{roomId: string, title: string}>>} Unique rooms in
 *   the order they were first encountered.
 * @throws {Error} When the first page does not show that the keyword filter
 *   was applied, or when a page request fails.
 */
async function searchRooms(keyword) {
  const rooms = [];
  const seenRoomIds = new Set();
  let pageFrom = 0;
  let pageIndex = 1;
  while (true) {
    const html = await fetchText(buildSearchUrl(keyword, pageFrom));
    const parsed = parseSearchResults(html, keyword);
    if (pageIndex === 1 && !parsed.hasFilter) {
      throw new Error('Search filter was not applied. The site may have changed its parameters.');
    }
    const countBeforePage = rooms.length;
    for (const room of parsed.rooms) {
      if (seenRoomIds.has(room.roomId)) {
        continue;
      }
      seenRoomIds.add(room.roomId);
      rooms.push(room);
    }
    console.log(`search page ${pageIndex}: ${parsed.rooms.length} room(s)`);
    if (parsed.rooms.length < SEARCH_PAGE_SIZE) {
      break;
    }
    if (rooms.length === countBeforePage) {
      // A full page made only of known rooms means the offset is not
      // advancing the listing; asking for more pages would loop forever.
      console.warn(`search page ${pageIndex}: no new rooms, stopping pagination`);
      break;
    }
    pageFrom += SEARCH_PAGE_SIZE;
    pageIndex += 1;
    await sleep(ANALYZE_DELAY_MS);
  }
  return rooms;
}
/**
 * Read the room title and the number of log links from one past-log page.
 *
 * @param {string} html - Raw HTML of the past-log listing page.
 * @param {string} roomId - Room identifier; accepted but not used here.
 * @returns {{title: string, entryCount: number}} `title` is the normalized
 *   text of the first `h3.ex` element (empty when absent); `entryCount` is
 *   the number of `ul > li > a` links whose href contains `/PastLog?`.
 */
function parsePastLogPage(html, roomId) {
  const root = parser.parse(html);
  const heading = root.querySelector('h3.ex');
  const rawTitle = heading?.text.trim() || '';
  const title = normalizePastLogTitle(decodeHtml(rawTitle));
  const logLinks = root.querySelectorAll('ul > li > a[href*="/PastLog?"]');
  return { title, entryCount: logLinks.length };
}
/**
 * Walk a room's past-log listing and total its pages and entries.
 *
 * Every fetched page is counted, including a final page that turns out to
 * have fewer than PAST_LOG_PAGE_SIZE entries (possibly none). The title from
 * the most recent page that has a non-empty one replaces the search title.
 *
 * @param {{roomId: string, title: string}} room - Room to inspect.
 * @returns {Promise<object>} Copy of `room` with `title`, `pastLogPages` and
 *   `pastLogEntries` set.
 */
async function countPastLogPages(room) {
  const totals = { pages: 0, entries: 0 };
  let title = room.title;
  let offset = 0;
  let reachedLastPage = false;

  while (!reachedLastPage) {
    const html = await fetchText(buildPastLogUrl(room.roomId, offset));
    const page = parsePastLogPage(html, room.roomId);
    totals.pages += 1;
    totals.entries += page.entryCount;
    if (page.title) {
      title = page.title;
    }
    // A short page marks the end of the listing.
    reachedLastPage = page.entryCount < PAST_LOG_PAGE_SIZE;
    if (!reachedLastPage) {
      offset += PAST_LOG_PAGE_SIZE;
      await sleep(ANALYZE_DELAY_MS);
    }
  }

  return {
    ...room,
    title,
    pastLogPages: totals.pages,
    pastLogEntries: totals.entries,
  };
}
/**
 * Count past-log pages for every room using a small pool of workers.
 *
 * Up to ANALYZE_CONCURRENCY workers pull the next unclaimed index from a
 * shared cursor, so results land in the same positions as the input rooms.
 * Each worker pauses ANALYZE_DELAY_MS after finishing a room.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to analyze.
 * @returns {Promise<Array<object>>} Results of countPastLogPages, in input order.
 */
async function analyzeRooms(rooms) {
  const results = new Array(rooms.length);
  let cursor = 0;

  const runWorker = async () => {
    // Claim an index before checking it, so each index is taken exactly once.
    for (let slot = cursor++; slot < rooms.length; slot = cursor++) {
      const room = rooms[slot];
      console.log(`analyze ${slot + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
      const outcome = await countPastLogPages(room);
      results[slot] = outcome;
      console.log(` -> ${outcome.pastLogPages} page(s), ${outcome.pastLogEntries} log(s)`);
      await sleep(ANALYZE_DELAY_MS);
    }
  };

  const workerCount = Math.min(ANALYZE_CONCURRENCY, rooms.length);
  const workers = [];
  for (let started = 0; started < workerCount; started += 1) {
    workers.push(runWorker());
  }
  await Promise.all(workers);
  return results;
}
/**
 * Print the highest-ranked rooms to the console.
 *
 * Rooms are ordered by past-log pages (descending), then past-log entries
 * (descending), then room id via localeCompare. The input array is not
 * modified.
 *
 * @param {Array<{roomId: string, title: string, pastLogPages: number, pastLogEntries: number}>} rooms
 *   Analyzed rooms.
 * @param {number} topN - Maximum number of rooms to print.
 * @returns {void}
 */
function printTopRooms(rooms, topN) {
  const byRank = (a, b) => {
    if (a.pastLogPages !== b.pastLogPages) {
      return b.pastLogPages - a.pastLogPages;
    }
    if (a.pastLogEntries !== b.pastLogEntries) {
      return b.pastLogEntries - a.pastLogEntries;
    }
    return a.roomId.localeCompare(b.roomId);
  };

  // Sort a copy so the caller's array keeps its order.
  const ranked = Array.from(rooms);
  ranked.sort(byRank);
  const winners = ranked.slice(0, topN);

  console.log('');
  console.log(`Top ${winners.length} rooms by past log pages`);
  winners.forEach((room, position) => {
    const rank = String(position + 1).padStart(3, ' ');
    console.log(
      `${rank}. [${room.roomId}] ${room.title} | ${room.pastLogPages} page(s) | ${room.pastLogEntries} log(s)`
    );
  });
}
/**
 * Promise adapter around the callback-style processRoom.
 *
 * The promise is fulfilled with whatever processRoom passes to its callback.
 * No rejection path is wired to the callback.
 *
 * @param {string} roomId - Room to crawl.
 * @param {boolean} fullMode - Forwarded to processRoom as its third argument.
 * @returns {Promise<*>} Fulfilled when processRoom invokes the callback.
 */
function crawlRoom(roomId, fullMode) {
  return new Promise((done) => {
    const onFinished = done;
    processRoom(roomId, onFinished, fullMode);
  });
}
/**
 * Crawl rooms one after another, pausing CRAWL_ROOM_DELAY_MS between rooms.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to crawl, in order.
 * @param {boolean} fullMode - Forwarded to crawlRoom for every room.
 * @returns {Promise<void>} Fulfilled after the last room finishes.
 */
async function crawlRoomsSequentially(rooms, fullMode) {
  for (const [position, room] of rooms.entries()) {
    console.log(`crawl ${position + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
    await crawlRoom(room.roomId, fullMode);
    // No pause is needed once the final room is done.
    const isLastRoom = position >= rooms.length - 1;
    if (!isLastRoom) {
      await sleep(CRAWL_ROOM_DELAY_MS);
    }
  }
}
/**
 * Parse command-line arguments into crawl options.
 *
 * Recognized flags are `--full`, `--report-only` and `--top <n>`. Every other
 * argument is treated as part of the keyword; the parts are joined with a
 * single space.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{fullMode: boolean, reportOnly: boolean, top: number, keyword: string}}
 *   Parsed options; `top` defaults to 100.
 * @throws {Error} When `--top` is not followed by a positive integer, or when
 *   no keyword argument is present.
 */
function parseArgs(argv) {
  const options = {
    fullMode: false,
    reportOnly: false,
    top: 100,
  };
  const keywordParts = [];

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    cursor += 1;
    switch (token) {
      case '--full':
        options.fullMode = true;
        break;
      case '--report-only':
        options.reportOnly = true;
        break;
      case '--top': {
        // The cursor already points at the argument following `--top`.
        const requested = Number(argv[cursor]);
        if (!(Number.isInteger(requested) && requested > 0)) {
          throw new Error('--top requires a positive integer');
        }
        options.top = requested;
        cursor += 1;
        break;
      }
      default:
        keywordParts.push(token);
    }
  }

  if (keywordParts.length === 0) {
    throw new Error('Usage: node search-rooms.js <keyword> [--top 100] [--report-only] [--full]');
  }
  options.keyword = keywordParts.join(' ');
  return options;
}
/**
 * Command-line workflow: search rooms, analyze their past logs, print the
 * ranking, and (unless `--report-only` was given) crawl every analyzed room.
 *
 * Returns early without analyzing when the search matches no rooms.
 *
 * @returns {Promise<void>} Fulfilled when the workflow finishes.
 */
async function main() {
  const { keyword, top, reportOnly, fullMode } = parseArgs(process.argv.slice(2));
  console.log(`search keyword: ${keyword}`);

  const matchedRooms = await searchRooms(keyword);
  console.log(`matched rooms: ${matchedRooms.length}`);
  if (matchedRooms.length === 0) {
    return;
  }

  const analyzedRooms = await analyzeRooms(matchedRooms);
  printTopRooms(analyzedRooms, top);

  if (!reportOnly) {
    console.log('');
    console.log(`crawl start: ${analyzedRooms.length} room(s)`);
    await crawlRoomsSequentially(analyzedRooms, fullMode);
    console.log('crawl complete');
  }
}
// Script entry point: run main() and, if it rejects, print only the error
// message and set process.exitCode to 1 rather than calling process.exit().
main().catch(error => {
console.error(error.message);
process.exitCode = 1;
});