update
This commit is contained in:
+324
@@ -0,0 +1,324 @@
|
||||
const https = require('https');
|
||||
const parser = require('node-html-parser');
|
||||
const { processRoom } = require('./index');
|
||||
|
||||
// Step used for the search listing's `pageFrom` offset; a page with fewer
// rooms than this is treated as the last page.
const SEARCH_PAGE_SIZE = 20;
// Step used for the past-log list's `pageFrom` offset; a page with fewer
// entries than this is treated as the last page.
const PAST_LOG_PAGE_SIZE = 100;
// Pause (ms) inserted between consecutive listing / past-log requests.
const ANALYZE_DELAY_MS = 400;
// Maximum number of rooms analyzed in parallel.
const ANALYZE_CONCURRENCY = 6;
// Pause (ms) inserted between two consecutive room crawls.
const CRAWL_ROOM_DELAY_MS = 5000;
|
||||
|
||||
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed on the timer.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Escapes every regular-expression metacharacter in `value` so the result
 * can be embedded in a `RegExp` and match the text literally.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with `. * + ? ^ $ { } ( ) | [ ] \` backslash-escaped.
 */
function escapeRegExp(value) {
  // `$&` re-inserts the matched metacharacter after the added backslash.
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
|
||||
|
||||
/**
 * Decodes the basic HTML character references (amp, lt, gt, quot and the
 * numeric reference #39) into their literal characters.
 *
 * All references are replaced in a single pass, so text that is itself an
 * escaped reference (an escaped ampersand followed by `lt;`) is decoded
 * exactly once instead of collapsing all the way down to `<`.
 *
 * @param {string} value - Text that may contain HTML character references.
 * @returns {string} The decoded text.
 */
function decodeHtml(value) {
  // Keyed by reference name (without the leading `&` and trailing `;`) so
  // the table stays intact even if this file is passed through an HTML
  // unescaper.
  const charByReference = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    '#39': "'",
  };

  return value.replace(/&(amp|lt|gt|quot|#39);/g, (_match, reference) => charByReference[reference]);
}
|
||||
|
||||
/**
 * Strips the decoration the past-log list page puts around a room title:
 * a leading `「` and a trailing `」の過去ログ一覧`.
 *
 * @param {string} value - Heading text from the past-log list page.
 * @returns {string} The bare room title, trimmed.
 */
function normalizePastLogTitle(value) {
  return value
    // Trim first so surrounding whitespace cannot defeat the `^` / `$` anchors.
    .trim()
    .replace(/^「/, '')
    .replace(/」の過去ログ一覧$/, '')
    .trim();
}
|
||||
|
||||
/**
 * Downloads `targetUrl` over HTTPS and resolves with the body decoded as
 * UTF-8 text.
 *
 * Rejects when the status code is not 200, when the request or the response
 * stream errors, when the body ends before the message is complete, or when
 * the socket stays idle longer than the timeout below.
 *
 * @param {string} targetUrl - Absolute https URL to fetch.
 * @returns {Promise<string>} The response body.
 */
function fetchText(targetUrl) {
  // Idle timeout so a stalled connection cannot hang the whole run.
  const requestTimeoutMs = 30000;

  return new Promise((resolve, reject) => {
    const request = https.get(targetUrl, (response) => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released.
        response.resume();
        reject(new Error(`Request failed (${response.statusCode}) for ${targetUrl}`));
        return;
      }

      response.setEncoding('utf8');
      let rawData = '';
      response.on('data', (chunk) => {
        rawData += chunk;
      });
      response.on('error', reject);
      response.on('end', () => {
        // `complete` is false when the connection closed mid-body.
        if (!response.complete) {
          reject(new Error(`Response ended prematurely for ${targetUrl}`));
          return;
        }
        resolve(rawData);
      });
    });

    request.on('error', reject);
    request.setTimeout(requestTimeoutMs, () => {
      // Destroying with an error surfaces it through the 'error' handler above.
      request.destroy(new Error(`Request timed out after ${requestTimeoutMs} ms for ${targetUrl}`));
    });
  });
}
|
||||
|
||||
/**
 * Builds the room-search listing URL for `keyword`.
 *
 * @param {string} keyword - Search text; URL-encoded by URLSearchParams.
 * @param {number} pageFrom - Result offset; only sent when greater than 0.
 * @returns {string} Absolute URL ending in the `#freerooms` fragment.
 */
function buildSearchUrl(keyword, pageFrom) {
  const query = new URLSearchParams({
    action: 'Index',
    search: keyword,
    ie: 'UTF-8',
  });

  // The first page is requested without an explicit offset.
  if (pageFrom > 0) {
    query.set('pageFrom', String(pageFrom));
  }

  return `https://chat.luvul.net/?${query.toString()}#freerooms`;
}
|
||||
|
||||
/**
 * Builds the past-log list URL for one room.
 *
 * @param {string|number} roomId - Room identifier; sent as `room_id`.
 * @param {number} pageFrom - Entry offset; always sent, including 0.
 * @returns {string} Absolute URL of the past-log list page.
 */
function buildPastLogUrl(roomId, pageFrom) {
  const query = new URLSearchParams({
    action: 'PastLogList',
    room_id: String(roomId),
    pageFrom: String(pageFrom),
  });

  return `https://chat.luvul.net/?${query.toString()}`;
}
|
||||
|
||||
/**
 * Extracts the rooms listed on one search-result page.
 *
 * Each `div.freeroomlistline` contributes at most one room, taken from its
 * first link whose href contains `/ChatRoom?room_id=`. Rooms are
 * de-duplicated by id; the first occurrence wins.
 *
 * @param {string} html - Raw HTML of the search page.
 * @param {string} keyword - Keyword that was searched for.
 * @returns {{rooms: Array<{roomId: string, title: string}>, hasFilter: boolean}}
 *   `hasFilter` is true when no keyword was given, or when the page contains
 *   the `class="filtered"` marker.
 */
function parseSearchResults(html, keyword) {
  const dom = parser.parse(html);
  const roomsById = new Map();

  for (const line of dom.querySelectorAll('div.freeroomlistline')) {
    const link = line.querySelector('a[href*="/ChatRoom?room_id="]');
    if (!link) {
      continue;
    }

    const href = link.getAttribute('href') || '';
    const roomIdMatch = href.match(/room_id=(\d+)/);
    if (!roomIdMatch) {
      continue;
    }

    const roomId = roomIdMatch[1];
    const title = decodeHtml(link.text.trim());

    if (!roomsById.has(roomId)) {
      roomsById.set(roomId, { roomId, title });
    }
  }

  return {
    rooms: [...roomsById.values()],
    hasFilter: keyword ? html.includes('class="filtered"') : true,
  };
}
|
||||
|
||||
/**
 * Pages through the site's search listing and collects every distinct room
 * matching `keyword`.
 *
 * Paging stops when a page holds fewer than SEARCH_PAGE_SIZE rooms, or when
 * a page contributes no room that has not been seen already. The second
 * condition prevents an endless loop if the site ignores the offset and
 * keeps returning the same full page.
 *
 * @param {string} keyword - Search text.
 * @returns {Promise<Array<{roomId: string, title: string}>>} Rooms in listing order.
 * @throws {Error} When the first page shows no sign that the filter was applied.
 */
async function searchRooms(keyword) {
  const rooms = [];
  const seenRoomIds = new Set();
  let pageFrom = 0;
  let pageIndex = 1;

  while (true) {
    const html = await fetchText(buildSearchUrl(keyword, pageFrom));
    const parsed = parseSearchResults(html, keyword);

    if (pageIndex === 1 && !parsed.hasFilter) {
      throw new Error('Search filter was not applied. The site may have changed its parameters.');
    }

    let newRoomCount = 0;
    for (const room of parsed.rooms) {
      if (seenRoomIds.has(room.roomId)) {
        continue;
      }
      seenRoomIds.add(room.roomId);
      rooms.push(room);
      newRoomCount += 1;
    }

    console.log(`search page ${pageIndex}: ${parsed.rooms.length} room(s)`);

    if (parsed.rooms.length < SEARCH_PAGE_SIZE) {
      break;
    }
    // A full page made up entirely of already-seen rooms means the offset is
    // not advancing the listing; requesting more pages would never terminate.
    if (newRoomCount === 0) {
      break;
    }

    pageFrom += SEARCH_PAGE_SIZE;
    pageIndex += 1;
    await sleep(ANALYZE_DELAY_MS);
  }

  return rooms;
}
|
||||
|
||||
/**
 * Reads the room title and the number of log links from one past-log list
 * page.
 *
 * @param {string} html - Raw HTML of the past-log list page.
 * @param {string|number} roomId - Not used by the parser; kept so existing
 *   callers keep working.
 * @returns {{title: string, entryCount: number}} `title` is '' when the page
 *   has no `h3.ex` heading; `entryCount` counts `ul > li > a` links whose
 *   href contains `/PastLog?`.
 */
function parsePastLogPage(html, roomId) {
  const dom = parser.parse(html);
  const headingText = dom.querySelector('h3.ex')?.text.trim() || '';
  const title = normalizePastLogTitle(decodeHtml(headingText));
  const entries = dom.querySelectorAll('ul > li > a[href*="/PastLog?"]');

  return {
    title,
    entryCount: entries.length,
  };
}
|
||||
|
||||
/**
 * Walks a room's past-log list page by page and totals its pages and log
 * entries.
 *
 * Paging stops at the first page with fewer than PAST_LOG_PAGE_SIZE entries.
 * The first page always counts as a page. A later page that turns out to be
 * empty (which happens when the entry total is an exact multiple of the page
 * size) is fetched to detect the end but is not counted.
 *
 * @param {{roomId: string, title: string}} room - Room found by the search.
 * @returns {Promise<object>} A copy of `room` with `title` replaced by the
 *   title shown on the past-log page (when one was found) plus
 *   `pastLogPages` and `pastLogEntries`.
 */
async function countPastLogPages(room) {
  let pageFrom = 0;
  let pages = 0;
  let entries = 0;
  let resolvedTitle = room.title;

  while (true) {
    const html = await fetchText(buildPastLogUrl(room.roomId, pageFrom));
    const parsed = parsePastLogPage(html, room.roomId);

    // Do not count the empty probe page that follows a run of full pages.
    if (parsed.entryCount > 0 || pages === 0) {
      pages += 1;
    }
    entries += parsed.entryCount;

    if (parsed.title) {
      resolvedTitle = parsed.title;
    }

    if (parsed.entryCount < PAST_LOG_PAGE_SIZE) {
      break;
    }

    pageFrom += PAST_LOG_PAGE_SIZE;
    await sleep(ANALYZE_DELAY_MS);
  }

  return {
    ...room,
    title: resolvedTitle,
    pastLogPages: pages,
    pastLogEntries: entries,
  };
}
|
||||
|
||||
/**
 * Runs countPastLogPages over every room using up to ANALYZE_CONCURRENCY
 * workers that pull indexes from a shared counter.
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to analyze.
 * @returns {Promise<Array<object>>} Analysis results, in the same order as
 *   `rooms`. Rejects as soon as any single analysis fails.
 */
async function analyzeRooms(rooms) {
  const analyzed = new Array(rooms.length);
  let nextIndex = 0;

  async function worker() {
    // Claiming an index is synchronous (no await between the read and the
    // increment), so two workers can never take the same room.
    while (nextIndex < rooms.length) {
      const currentIndex = nextIndex;
      nextIndex += 1;

      const room = rooms[currentIndex];
      console.log(`analyze ${currentIndex + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
      const result = await countPastLogPages(room);
      analyzed[currentIndex] = result;
      console.log(`  -> ${result.pastLogPages} page(s), ${result.pastLogEntries} log(s)`);

      // Only pause when this worker may still pick up another room.
      if (nextIndex < rooms.length) {
        await sleep(ANALYZE_DELAY_MS);
      }
    }
  }

  const workerCount = Math.min(ANALYZE_CONCURRENCY, rooms.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return analyzed;
}
|
||||
|
||||
/**
 * Prints the `topN` rooms ranked by past-log page count (descending), then
 * by entry count (descending), then by room id (ascending, compared
 * numerically so that '9' sorts before '10').
 *
 * The input array is not modified.
 *
 * @param {Array<{roomId: string, title: string, pastLogPages: number, pastLogEntries: number}>} rooms
 * @param {number} topN - Maximum number of rooms to print.
 * @returns {void}
 */
function printTopRooms(rooms, topN) {
  const compareRooms = (left, right) => {
    if (right.pastLogPages !== left.pastLogPages) {
      return right.pastLogPages - left.pastLogPages;
    }
    if (right.pastLogEntries !== left.pastLogEntries) {
      return right.pastLogEntries - left.pastLogEntries;
    }
    // Room ids are digit strings; `numeric: true` avoids '10' < '9'.
    return String(left.roomId).localeCompare(String(right.roomId), undefined, { numeric: true });
  };

  // Sort a copy so the caller's array keeps its order.
  const topRooms = [...rooms].sort(compareRooms).slice(0, topN);

  console.log('');
  console.log(`Top ${topRooms.length} rooms by past log pages`);
  topRooms.forEach((room, index) => {
    const rank = String(index + 1).padStart(3, ' ');
    console.log(
      `${rank}. [${room.roomId}] ${room.title} | ${room.pastLogPages} page(s) | ${room.pastLogEntries} log(s)`
    );
  });
}
|
||||
|
||||
/**
 * Promise adapter around the callback-style `processRoom` imported from
 * `./index`.
 *
 * The promise resolves when `processRoom` invokes the callback passed as its
 * second argument, with whatever value that callback receives. It rejects
 * only if `processRoom` throws synchronously.
 * NOTE(review): presumably the callback signals that the room crawl has
 * finished — confirm against `./index`.
 *
 * @param {string} roomId - Room to crawl.
 * @param {boolean} fullMode - Forwarded unchanged as the third argument.
 * @returns {Promise<*>}
 */
function crawlRoom(roomId, fullMode) {
  return new Promise((resolve) => {
    processRoom(roomId, resolve, fullMode);
  });
}
|
||||
|
||||
/**
 * Crawls the rooms one at a time, waiting CRAWL_ROOM_DELAY_MS between two
 * consecutive rooms (no wait after the last one).
 *
 * @param {Array<{roomId: string, title: string}>} rooms - Rooms to crawl, in order.
 * @param {boolean} fullMode - Forwarded to crawlRoom for every room.
 * @returns {Promise<void>}
 */
async function crawlRoomsSequentially(rooms, fullMode) {
  const lastIndex = rooms.length - 1;

  for (const [index, room] of rooms.entries()) {
    console.log(`crawl ${index + 1}/${rooms.length}: ${room.roomId} ${room.title}`);
    await crawlRoom(room.roomId, fullMode);

    if (index < lastIndex) {
      await sleep(CRAWL_ROOM_DELAY_MS);
    }
  }
}
|
||||
|
||||
/**
 * Parses the command-line arguments (without the `node` and script entries).
 *
 * Recognized flags: `--full`, `--report-only`, `--top <positive integer>`.
 * Every other argument is treated as part of the keyword; the parts are
 * joined with a single space.
 *
 * @param {string[]} argv - Arguments to parse.
 * @returns {{fullMode: boolean, reportOnly: boolean, top: number, keyword: string}}
 * @throws {Error} When `--top` is not followed by a positive integer, or
 *   when no keyword was given.
 */
function parseArgs(argv) {
  const options = {
    fullMode: false,
    reportOnly: false,
    top: 100,
  };
  const positional = [];

  for (let index = 0; index < argv.length; index += 1) {
    const value = argv[index];

    if (value === '--full') {
      options.fullMode = true;
    } else if (value === '--report-only') {
      options.reportOnly = true;
    } else if (value === '--top') {
      // A missing value yields Number(undefined) === NaN and is rejected below.
      const topValue = Number(argv[index + 1]);
      if (!Number.isInteger(topValue) || topValue <= 0) {
        throw new Error('--top requires a positive integer');
      }
      options.top = topValue;
      // Skip the value that was just consumed.
      index += 1;
    } else {
      positional.push(value);
    }
  }

  if (positional.length === 0) {
    throw new Error('Usage: node search-rooms.js <keyword> [--top 100] [--report-only] [--full]');
  }

  options.keyword = positional.join(' ');
  return options;
}
|
||||
|
||||
/**
 * Command-line entry point: searches rooms by keyword, analyzes each match's
 * past-log volume, prints the ranking and, unless `--report-only` was
 * given, crawls every analyzed room in sequence.
 *
 * Note that the crawl covers all analyzed rooms, not just the `--top` ones
 * that were printed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));
  console.log(`search keyword: ${options.keyword}`);

  const rooms = await searchRooms(options.keyword);
  console.log(`matched rooms: ${rooms.length}`);

  // Nothing to analyze or crawl.
  if (rooms.length === 0) {
    return;
  }

  const analyzedRooms = await analyzeRooms(rooms);
  printTopRooms(analyzedRooms, options.top);

  if (options.reportOnly) {
    return;
  }

  console.log('');
  console.log(`crawl start: ${analyzedRooms.length} room(s)`);
  await crawlRoomsSequentially(analyzedRooms, options.fullMode);
  console.log('crawl complete');
}
|
||||
|
||||
// Run the CLI. On failure, print only the error message and set a non-zero
// exit code, leaving the process to exit on its own.
main().catch((error) => {
  console.error(error.message);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user