const https = require('https');
const parser = require('node-html-parser');
const { processRoom } = require('./index');

// Step added to `pageFrom` between search pages; a page with fewer rooms ends pagination.
const SEARCH_PAGE_SIZE = 20;
// Step added to `pageFrom` between past-log pages; a page with fewer entries ends pagination.
const PAST_LOG_PAGE_SIZE = 100;
// Pause between consecutive page requests and between rooms handled by one analyze worker.
const ANALYZE_DELAY_MS = 400;
// Upper bound on the number of analyze workers running at the same time.
const ANALYZE_CONCURRENCY = 6;
// Pause between two consecutive room crawls.
const CRAWL_ROOM_DELAY_MS = 5000;
// Socket idle time after which a request is aborted instead of hanging the whole run.
const REQUEST_TIMEOUT_MS = 30000;

// Named character references understood by decodeHtml.
const HTML_NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

// Matches decimal (&#39;), hexadecimal (&#x27;) and named (&amp;) character references.
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g;

// Largest valid Unicode code point; String.fromCodePoint throws above it.
const MAX_CODE_POINT = 0x10ffff;

/**
 * Resolves after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Escapes regular-expression metacharacters so the value can be embedded in a
 * pattern as literal text. Not referenced by the rest of this file as shown.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Decodes HTML character references in a single pass.
 *
 * Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and numeric references
 * (`&#39;`, `&#x27;`). Unknown names and out-of-range code points are left
 * untouched. Decoding in one pass means the output of one replacement is never
 * re-scanned, so `&amp;lt;` becomes `&lt;` rather than `<`.
 *
 * NOTE(review): callers pass `.text` from node-html-parser, which is documented
 * as already unescaped; if so this only affects doubly-escaped markup - confirm.
 *
 * @param {string} value - Text that may contain character references.
 * @returns {string} The decoded text.
 */
function decodeHtml(value) {
  return value.replace(HTML_ENTITY_PATTERN, (match, decimal, hex, name) => {
    if (name !== undefined) {
      return HTML_NAMED_ENTITIES.get(name) ?? match;
    }
    const codePoint = decimal !== undefined
      ? Number.parseInt(decimal, 10)
      : Number.parseInt(hex, 16);
    return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Strips the leading `「` and the trailing `」の過去ログ一覧` from a past-log
 * page heading and trims the result.
 *
 * @param {string} value - Heading text.
 * @returns {string} The heading without that decoration.
 */
function normalizePastLogTitle(value) {
  return value
    .replace(/^「/, '')
    .replace(/」の過去ログ一覧$/, '')
    .trim();
}

/**
 * Downloads a URL over HTTPS and returns the body decoded as UTF-8.
 *
 * @param {string} targetUrl - Absolute https URL.
 * @returns {Promise<string>} The response body.
 * @throws {Error} Rejects on a non-200 status, a request or response stream
 *   error, or when the socket stays idle for REQUEST_TIMEOUT_MS.
 */
function fetchText(targetUrl) {
  return new Promise((resolve, reject) => {
    const request = https.get(targetUrl, { timeout: REQUEST_TIMEOUT_MS }, response => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released.
        response.resume();
        reject(new Error(`Request failed (${response.statusCode}) for ${targetUrl}`));
        return;
      }

      response.setEncoding('utf8');
      let rawData = '';
      response.on('data', chunk => {
        rawData += chunk;
      });
      response.on('end', () => resolve(rawData));
      // A connection dropped mid-body must reject rather than leave the promise pending.
      response.on('error', reject);
    });

    // The 'timeout' event only signals idleness; the request has to be torn down explicitly.
    request.on('timeout', () => {
      request.destroy(
        new Error(`Request timed out after ${REQUEST_TIMEOUT_MS} ms for ${targetUrl}`)
      );
    });
    request.on('error', reject);
  });
}

/**
 * Builds the room search URL for a keyword and result offset.
 *
 * @param {string} keyword - Search keyword.
 * @param {number} pageFrom - Result offset; omitted from the query when not positive.
 * @returns {string} The search URL, ending in the `#freerooms` fragment.
 */
function buildSearchUrl(keyword, pageFrom) {
  const params = new URLSearchParams({
    action: 'Index',
    search: keyword,
    ie: 'UTF-8',
  });
  if (pageFrom > 0) {
    params.set('pageFrom', String(pageFrom));
  }
  return `https://chat.luvul.net/?${params.toString()}#freerooms`;
}

/**
 * Builds the past-log list URL for a room and entry offset.
 *
 * @param {string|number} roomId - Room identifier.
 * @param {number} pageFrom - Entry offset.
 * @returns {string} The past-log list URL.
 */
function buildPastLogUrl(roomId, pageFrom) {
  const params = new URLSearchParams({
    action: 'PastLogList',
    room_id: String(roomId),
    pageFrom: String(pageFrom),
  });
  return `https://chat.luvul.net/?${params.toString()}`;
}

/**
 * Extracts the rooms listed on one search-result page.
 *
 * Each `div.freeroomlistline` contributes at most one room, taken from its
 * first link whose href contains `/ChatRoom?room_id=`. Rooms are deduplicated
 * by id, keeping the first occurrence.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} keyword - Keyword that was searched for.
 * @returns {{rooms: Array<{roomId: string, title: string}>, hasFilter: boolean}}
 *   `hasFilter` is true when no keyword was given, otherwise true only if the
 *   raw HTML contains `class="filtered"`.
 */
function parseSearchResults(html, keyword) {
  const dom = parser.parse(html);
  const roomMap = new Map();

  for (const line of dom.querySelectorAll('div.freeroomlistline')) {
    const link = line.querySelector('a[href*="/ChatRoom?room_id="]');
    if (!link) {
      continue;
    }

    const href = link.getAttribute('href') || '';
    const roomIdMatch = href.match(/room_id=(\d+)/);
    if (!roomIdMatch) {
      continue;
    }

    const roomId = roomIdMatch[1];
    const title = decodeHtml(link.text.trim());
    if (!roomMap.has(roomId)) {
      roomMap.set(roomId, { roomId, title });
    }
  }

  return {
    rooms: [...roomMap.values()],
    hasFilter: keyword ? html.includes('class="filtered"') : true,
  };
}

/**
 * Walks the paginated search results and collects every distinct room.
 *
 * Paging stops when a page holds fewer than SEARCH_PAGE_SIZE rooms, or when a
 * page contributes no room that was not already seen. The second condition
 * prevents an endless loop if the site keeps answering with the same page.
 *
 * @param {string} keyword - Search keyword.
 * @returns {Promise<Array<{roomId: string, title: string}>>} Rooms in discovery order.
 * @throws {Error} When the first page does not show the filter marker, or when
 *   a request fails.
 */
async function searchRooms(keyword) {
  const rooms = [];
  const seenRoomIds = new Set();
  let pageFrom = 0;
  let pageIndex = 1;

  while (true) {
    const html = await fetchText(buildSearchUrl(keyword, pageFrom));
    const parsed = parseSearchResults(html, keyword);

    if (pageIndex === 1 && !parsed.hasFilter) {
      throw new Error('Search filter was not applied. The site may have changed its parameters.');
    }

    let newRoomCount = 0;
    for (const room of parsed.rooms) {
      if (seenRoomIds.has(room.roomId)) {
        continue;
      }
      seenRoomIds.add(room.roomId);
      rooms.push(room);
      newRoomCount += 1;
    }

    console.log(`search page ${pageIndex}: ${parsed.rooms.length} room(s)`);

    if (parsed.rooms.length < SEARCH_PAGE_SIZE || newRoomCount === 0) {
      break;
    }

    pageFrom += SEARCH_PAGE_SIZE;
    pageIndex += 1;
    await sleep(ANALYZE_DELAY_MS);
  }

  return rooms;
}

/**
 * Reads the title and the number of log links from one past-log list page.
 *
 * @param {string} html - Raw page HTML.
 * @param {string|number} roomId - Currently unused; kept so existing callers keep working.
 * @returns {{title: string, entryCount: number}} `title` is the normalized
 *   text of `h3.ex` (empty when that element is missing); `entryCount` is the
 *   number of `ul > li > a` links whose href contains `/PastLog?`.
 */
function parsePastLogPage(html, roomId) {
  const dom = parser.parse(html);
  const heading = dom.querySelector('h3.ex')?.text.trim() || '';
  const title = normalizePastLogTitle(decodeHtml(heading));
  const entries = dom.querySelectorAll('ul > li > a[href*="/PastLog?"]');

  return {
    title,
    entryCount: entries.length,
  };
}

/**
 * Walks a room's paginated past-log list and counts pages and entries.
 *
 * Paging stops at the first page holding fewer than PAST_LOG_PAGE_SIZE entries.
 *
 * @param {{roomId: string, title: string}} room - Room to analyze.
 * @returns {Promise<object>} A copy of `room` with `pastLogPages` and
 *   `pastLogEntries` added, and `title` replaced by the most recent non-empty
 *   title seen on a past-log page.
 */
async function countPastLogPages(room) {
  let pageFrom = 0;
  let pages = 0;
  let entries = 0;
  let resolvedTitle = room.title;

  while (true) {
    const html = await fetchText(buildPastLogUrl(room.roomId, pageFrom));
    const parsed = parsePastLogPage(html, room.roomId);

    pages += 1;
    entries += parsed.entryCount;
    if (parsed.title) {
      resolvedTitle = parsed.title;
    }

    if (parsed.entryCount < PAST_LOG_PAGE_SIZE) {
      break;
    }

    pageFrom += PAST_LOG_PAGE_SIZE;
    await sleep(ANALYZE_DELAY_MS);
  }

  return {
    ...room,
    title: resolvedTitle,
    pastLogPages: pages,
    pastLogEntries: entries,
  };
}

/**
 * Analyzes every room using up to ANALYZE_CONCURRENCY workers that pull from a
 * shared index.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to analyze.
 * @returns {Promise<Array<object>>} Results in the same order as `rooms`.
 */
async function analyzeRooms(rooms) {
  const analyzed = new Array(rooms.length);
  let nextIndex = 0;

  async function worker() {
    while (true) {
      // Claiming the index happens without an intervening await, so no two workers share one.
      const currentIndex = nextIndex;
      nextIndex += 1;
      if (currentIndex >= rooms.length) {
        return;
      }

      const room = rooms[currentIndex];
      console.log(`analyze ${currentIndex + 1}/${rooms.length}: ${room.roomId} ${room.title}`);

      const result = await countPastLogPages(room);
      analyzed[currentIndex] = result;
      console.log(` -> ${result.pastLogPages} page(s), ${result.pastLogEntries} log(s)`);

      await sleep(ANALYZE_DELAY_MS);
    }
  }

  await Promise.all(
    Array.from({ length: Math.min(ANALYZE_CONCURRENCY, rooms.length) }, () => worker())
  );

  return analyzed;
}

/**
 * Prints the rooms with the most past-log pages, one room per line.
 *
 * Ordering: past-log pages descending, then past-log entries descending, then
 * room id ascending. The input array is not modified.
 *
 * @param {Array<object>} rooms - Analyzed rooms.
 * @param {number} topN - Maximum number of rooms to print.
 * @returns {void}
 */
function printTopRooms(rooms, topN) {
  const topRooms = [...rooms]
    .sort((left, right) => {
      if (right.pastLogPages !== left.pastLogPages) {
        return right.pastLogPages - left.pastLogPages;
      }
      if (right.pastLogEntries !== left.pastLogEntries) {
        return right.pastLogEntries - left.pastLogEntries;
      }
      return left.roomId.localeCompare(right.roomId);
    })
    .slice(0, topN);

  console.log('');
  console.log(`Top ${topRooms.length} rooms by past log pages`);

  for (const [index, room] of topRooms.entries()) {
    const rank = String(index + 1).padStart(3, ' ');
    console.log(
      `${rank}. [${room.roomId}] ${room.title} | ${room.pastLogPages} page(s) | ${room.pastLogEntries} log(s)`
    );
  }
}

/**
 * Promise adapter around the callback-style `processRoom`.
 *
 * @param {string} roomId - Room identifier.
 * @param {boolean} fullMode - Forwarded unchanged to `processRoom`.
 * @returns {Promise<*>} Resolves when `processRoom` invokes its callback.
 */
function crawlRoom(roomId, fullMode) {
  return new Promise(resolve => {
    processRoom(roomId, resolve, fullMode);
  });
}

/**
 * Crawls rooms one at a time, pausing CRAWL_ROOM_DELAY_MS between rooms but
 * not after the last one.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to crawl, in order.
 * @param {boolean} fullMode - Forwarded to `crawlRoom`.
 * @returns {Promise<void>}
 */
async function crawlRoomsSequentially(rooms, fullMode) {
  for (let index = 0; index < rooms.length; index += 1) {
    const room = rooms[index];
    console.log(`crawl ${index + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
    await crawlRoom(room.roomId, fullMode);

    if (index < rooms.length - 1) {
      await sleep(CRAWL_ROOM_DELAY_MS);
    }
  }
}

/**
 * Parses command-line arguments.
 *
 * Recognizes `--full`, `--report-only` and `--top <n>`. Every other argument
 * is treated as part of the keyword; the parts are joined with single spaces.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{fullMode: boolean, reportOnly: boolean, top: number, keyword: string}}
 * @throws {Error} When `--top` is not followed by a positive integer, or when
 *   no keyword is given.
 */
function parseArgs(argv) {
  const options = {
    fullMode: false,
    reportOnly: false,
    top: 100,
  };
  const positional = [];

  for (let index = 0; index < argv.length; index += 1) {
    const value = argv[index];

    if (value === '--full') {
      options.fullMode = true;
      continue;
    }

    if (value === '--report-only') {
      options.reportOnly = true;
      continue;
    }

    if (value === '--top') {
      const topValue = Number(argv[index + 1]);
      if (!Number.isInteger(topValue) || topValue <= 0) {
        throw new Error('--top requires a positive integer');
      }
      options.top = topValue;
      // Skip the value that was just consumed.
      index += 1;
      continue;
    }

    positional.push(value);
  }

  if (positional.length === 0) {
    throw new Error('Usage: node search-rooms.js [--top 100] [--report-only] [--full] <keyword>');
  }

  options.keyword = positional.join(' ');
  return options;
}

/**
 * Entry point: searches rooms, analyzes their past logs, prints the ranking
 * and, unless `--report-only` was given, crawls every analyzed room.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));

  console.log(`search keyword: ${options.keyword}`);
  const rooms = await searchRooms(options.keyword);
  console.log(`matched rooms: ${rooms.length}`);

  if (rooms.length === 0) {
    return;
  }

  const analyzedRooms = await analyzeRooms(rooms);
  printTopRooms(analyzedRooms, options.top);

  if (options.reportOnly) {
    return;
  }

  console.log('');
  console.log(`crawl start: ${analyzedRooms.length} room(s)`);
  await crawlRoomsSequentially(analyzedRooms, options.fullMode);
  console.log('crawl complete');
}

main().catch(error => {
  console.error(error.message);
  process.exitCode = 1;
});