analyze / src /lib /parser.ts
wuhp's picture
Upload 14 files
d614256 verified
/** A single user extracted from the parsed text. */
export interface UserNode {
/** Name shown for the user; `parseSocialData` falls back to the username text when no separate name is found. */
displayName: string;
/** Account handle; `parseSocialData` stores it with a leading `@` removed. */
username: string;
}
/** Everything `parseSocialData` produces for one input text. */
export interface ParseResult {
/** Per-subject lists, keyed by the subject's username: who the subject follows and who follows the subject. */
relationships: Map<string, { following: UserNode[], followers: UserNode[] }>;
/** Every user that was parsed, keyed by username; a later entry with the same username replaces the earlier one. */
allUsers: Map<string, UserNode>;
/** Human-readable parser messages, each prefixed with the local time at which it was recorded. */
logs: string[];
}
/**
 * Parses pasted follower/following text into a relationship graph.
 *
 * The text is read as a sequence of sections: a user line (the "subject"),
 * a `Following` or `Followers` header (optionally followed by a count), and
 * then the users belonging to that list. A user is written either as a
 * display-name line followed by a username line, or as one line that ends
 * with the username.
 *
 * A subject can only be recognised once the header that follows it is seen.
 * Until then it is provisionally recorded as a member of the list that was
 * being filled; when the header arrives, exactly that provisional entry is
 * withdrawn. Entries in other lists are never touched, and a subject whose
 * name is repeated before its second header does not end up following itself.
 *
 * @param text Raw text to parse.
 * @returns The per-subject relationship lists, every user seen, and a log of
 *   parser decisions.
 */
export function parseSocialData(text: string): ParseResult {
  const result: ParseResult = {
    relationships: new Map(),
    allUsers: new Map(),
    logs: [],
  };

  const addLog = (msg: string): void => {
    result.logs.push(`[${new Date().toLocaleTimeString()}] ${msg}`);
  };

  // Patterns are created once instead of on every loop iteration.
  const headerLine = /^(Following|Followers)$/i;
  const numericLine = /^\d+$/;
  const usernameLine = /^@?[a-z0-9._]+$/i;
  const trailingUsername = /^(.*?)(@?[a-z0-9._]+)$/i;
  const placeholderUser = /^user\d+$/i;

  // 1. Clean text: drop zero-width characters, treat runs of dots as line
  //    breaks, and put headers glued to their count on their own lines.
  const cleaned = text
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .replace(/\.{3,}/g, '\n')
    .replace(/(Following|Followers)(\d+)/gi, '\n$1\n$2\n');

  const rawLines = cleaned
    .split('\n')
    .map(l => l.trim())
    .filter(l => l.length > 0);

  addLog(`Started parsing. Extracted ${rawLines.length} raw lines after cleanup.`);

  let currentSubject: string | null = null;
  let currentContext: 'following' | 'followers' | null = null;
  let lastParsedNode: UserNode | null = null;
  // The list that received `lastParsedNode`, or null when it was not stored
  // in any list (or its provisional entry has already been withdrawn).
  let lastPushedInto: UserNode[] | null = null;

  let i = 0;
  while (i < rawLines.length) {
    const line = rawLines[i];

    // Header line: the user parsed just before it is the section's subject.
    if (headerLine.test(line)) {
      const newContext = line.toLowerCase() === 'following' ? 'following' : 'followers';

      if (i + 1 < rawLines.length && numericLine.test(rawLines[i + 1])) {
        i++; // Skip the count line
      }

      if (lastParsedNode) {
        // Withdraw the subject from the one list it was provisionally added to.
        if (
          lastPushedInto &&
          lastPushedInto.length > 0 &&
          lastPushedInto[lastPushedInto.length - 1] === lastParsedNode
        ) {
          lastPushedInto.pop();
        }
        lastPushedInto = null;

        if (currentSubject !== lastParsedNode.username) {
          currentSubject = lastParsedNode.username;
          if (!result.relationships.has(currentSubject)) {
            result.relationships.set(currentSubject, { following: [], followers: [] });
          }
          addLog(`New graph center identified: ${currentSubject} (Tracking ${newContext})`);
        } else {
          addLog(`Switched context to: ${newContext} for ${currentSubject}`);
        }
      }

      currentContext = newContext;
      i++;
      continue;
    }

    // Ignore stray numbers
    if (numericLine.test(line)) {
      i++;
      continue;
    }

    const nextLine = i + 1 < rawLines.length ? rawLines[i + 1] : null;
    const nextIsHeaderOrNumber =
      nextLine !== null && (headerLine.test(nextLine) || numericLine.test(nextLine));

    let displayName: string;
    let username: string;

    if (nextLine !== null && !nextIsHeaderOrNumber && usernameLine.test(nextLine)) {
      // 2-line structure: display name, then username on the next line.
      displayName = line;
      username = nextLine;
      i += 2;
    } else {
      // 1-line structure: username glued to the end of the line, or standalone.
      const match = trailingUsername.exec(line);
      if (match) {
        username = match[2];
        displayName = match[1].trim() || username;
      } else {
        displayName = line;
        username = line.replace(/\s+/g, '').toLowerCase();
      }
      i++;
    }

    username = username.replace(/^@/, '');

    // Skip empty handles, purely numeric tokens, and generic placeholder accounts.
    if (username.length === 0 || numericLine.test(username) || placeholderUser.test(username)) {
      continue;
    }

    const node: UserNode = { displayName, username };
    result.allUsers.set(username, node);
    lastParsedNode = node;
    lastPushedInto = null;

    if (currentSubject !== null && currentContext !== null) {
      const rels = result.relationships.get(currentSubject);
      if (rels) {
        const target = rels[currentContext];
        target.push(node);
        lastPushedInto = target;
      }
    }
  }

  return result;
}
/** Per-user figures computed by `getCoreMembers`. */
export interface MemberMetrics {
/** Username the figures belong to. */
username: string;
/** Number of recorded "follows this user" listings. */
inDegree: number;
/** Number of recorded "this user follows" listings. */
outDegree: number;
/** Number of times the user was found on both sides of a subject's following and followers lists. */
mutuals: number;
/** Ranking score: `inDegree + outDegree + mutuals * 3`. */
score: number;
/** Set to true when the alt-detection heuristic flags this user; otherwise left unset. */
isAltCandidate?: boolean;
/** Identifies the higher-ranked account this user resembles; only set together with `isAltCandidate`. */
altOf?: string;
/** Subject in whose lists this user appears most often, or the user itself when it is a subject that appears in no list. */
primaryCenter?: string;
}
/**
 * Finds a shortest directed "follows" chain from `start` to `end`.
 *
 * Edges are taken from every subject's lists: `subject -> u` for each user in
 * `following`, and `u -> subject` for each user in `followers`.
 *
 * The search is a breadth-first traversal that records one predecessor per
 * visited user and advances a cursor through the queue array, so dequeuing is
 * constant-time and no per-node path copies are made.
 *
 * @param data Parsed relationship data.
 * @param start Username the chain starts from.
 * @param end Username the chain must reach.
 * @returns The usernames along the chain, including both endpoints, or `null`
 *   when `end` cannot be reached. When `start === end` the result is
 *   `[start]`, whether or not that user appears in `data`.
 */
export function findShortestPath(data: ParseResult, start: string, end: string): string[] | null {
  if (start === end) return [start];

  // Adjacency list built from all relationships.
  const adj = new Map<string, Set<string>>();
  const addEdge = (from: string, to: string): void => {
    let targets = adj.get(from);
    if (!targets) {
      targets = new Set();
      adj.set(from, targets);
    }
    targets.add(to);
  };

  for (const [subject, rels] of data.relationships) {
    for (const u of rels.following) addEdge(subject, u.username);
    for (const u of rels.followers) addEdge(u.username, subject);
  }

  // Maps each discovered user to the user it was reached from.
  const cameFrom = new Map<string, string | null>([[start, null]]);
  const frontier: string[] = [start];

  for (let head = 0; head < frontier.length; head++) {
    const node = frontier[head];
    const neighbors = adj.get(node);
    if (!neighbors) continue;

    for (const neighbor of neighbors) {
      if (cameFrom.has(neighbor)) continue;
      cameFrom.set(neighbor, node);

      if (neighbor === end) {
        // Walk the predecessor links back to `start`, then flip the order.
        const path: string[] = [];
        let cursor: string | null = end;
        while (cursor !== null) {
          path.push(cursor);
          cursor = cameFrom.get(cursor) ?? null;
        }
        return path.reverse();
      }

      frontier.push(neighbor);
    }
  }

  return null;
}
/**
 * Computes per-user graph metrics and ranks users by score.
 *
 * For every subject, each entry in `following` adds one to the subject's
 * out-degree and to the target's in-degree; each entry in `followers` adds
 * one to the subject's in-degree and to the follower's out-degree. A follower
 * that the subject also follows counts as a mutual for both. The score is
 * `inDegree + outDegree + mutuals * 3`.
 *
 * `primaryCenter` is the subject in whose lists a user appears most often
 * (the first such subject on ties); a subject that appears in nobody's list
 * is its own primary centre.
 *
 * Alt detection compares each user with every lower-ranked user. Two names are
 * considered related when, ignoring case, one contains the other or they share
 * the first five characters with a length difference under three. A related
 * lower-ranked user is flagged when the two accounts' following sets have a
 * Jaccard similarity above 0.3, or when either set has fewer than five
 * entries. Following sets are looked up by the exact username, and `altOf`
 * holds the exact username of the higher-ranked account.
 *
 * @param data Parsed relationship data.
 * @returns One metrics record per user, sorted by descending score.
 */
export function getCoreMembers(data: ParseResult): MemberMetrics[] {
  const metrics = new Map<string, MemberMetrics>();
  const getOrCreate = (username: string): MemberMetrics => {
    let entry = metrics.get(username);
    if (!entry) {
      entry = { username, inDegree: 0, outDegree: 0, mutuals: 0, score: 0 };
      metrics.set(username, entry);
    }
    return entry;
  };

  // Who each subject follows, for quick mutual and overlap lookups.
  const outwardEdges = new Map<string, Set<string>>();
  for (const [subject, rels] of data.relationships) {
    outwardEdges.set(subject, new Set(rels.following.map(u => u.username)));
  }

  // For each user: how many times they appear in each subject's lists.
  const centerRep = new Map<string, Map<string, number>>();
  const creditCenter = (member: string, subject: string): void => {
    let perSubject = centerRep.get(member);
    if (!perSubject) {
      perSubject = new Map();
      centerRep.set(member, perSubject);
    }
    perSubject.set(subject, (perSubject.get(subject) ?? 0) + 1);
  };

  // Degrees, mutuals and centre appearances.
  for (const [subject, rels] of data.relationships) {
    const subjNode = getOrCreate(subject);
    const subjectFollows = outwardEdges.get(subject);

    for (const u of rels.following) {
      subjNode.outDegree++;
      getOrCreate(u.username).inDegree++;
      creditCenter(u.username, subject);
    }

    for (const u of rels.followers) {
      subjNode.inDegree++;
      const sourceNode = getOrCreate(u.username);
      sourceNode.outDegree++;
      creditCenter(u.username, subject);
      if (subjectFollows?.has(u.username)) {
        subjNode.mutuals++;
        sourceNode.mutuals++;
      }
    }
  }

  // Score: degree + (mutuals * 3), plus primary centre assignment.
  for (const m of metrics.values()) {
    m.score = m.inDegree + m.outDegree + m.mutuals * 3;

    const reps = centerRep.get(m.username);
    if (reps) {
      // Highest appearance count wins; the first subject seen wins ties.
      let best: string | undefined;
      let bestCount = -Infinity;
      for (const [subject, count] of reps) {
        if (count > bestCount) {
          best = subject;
          bestCount = count;
        }
      }
      if (best !== undefined) m.primaryCenter = best;
    } else if (data.relationships.has(m.username)) {
      m.primaryCenter = m.username;
    }
  }

  const sorted = Array.from(metrics.values()).sort((a, b) => b.score - a.score);

  // Alt detection heuristic.
  const lowered = sorted.map(m => m.username.toLowerCase());
  const noEdges: ReadonlySet<string> = new Set<string>();

  for (let i = 0; i < sorted.length; i++) {
    const u1 = lowered[i];
    if (u1.length < 3) continue;

    for (let j = i + 1; j < sorted.length; j++) {
      const candidate = sorted[j];
      if (candidate.isAltCandidate) continue; // already attributed to a higher-ranked account

      const u2 = lowered[j];
      if (u2.length < 3) continue;

      // Substring match, or shared five-character prefix with similar length.
      const looksRelated =
        u1.includes(u2) ||
        u2.includes(u1) ||
        (u1.slice(0, 5) === u2.slice(0, 5) && Math.abs(u1.length - u2.length) < 3);
      if (!looksRelated) continue;

      // The edge map is keyed by the exact username, not the lower-cased one.
      const s1 = outwardEdges.get(sorted[i].username) ?? noEdges;
      const s2 = outwardEdges.get(candidate.username) ?? noEdges;

      let intersection = 0;
      for (const x of s1) {
        if (s2.has(x)) intersection++;
      }
      const union = s1.size + s2.size - intersection;
      const jaccard = union > 0 ? intersection / union : 0;

      if (jaccard > 0.3 || s1.size < 5 || s2.size < 5) {
        candidate.isAltCandidate = true;
        candidate.altOf = sorted[i].username;
      }
    }
  }

  return sorted;
}
/** One group of users produced by `detectClusters`. */
export interface ClusterInfo {
/** Username of the subject the cluster is built around. */
center: string;
/** Usernames in the cluster; the center itself is the first entry. */
members: string[];
/** Colour string for the cluster, taken from the fixed palette inside `detectClusters`. */
color: string;
}
/**
 * Groups users into clusters around the subjects they are most associated with.
 *
 * Every subject in `data.relationships` starts a cluster containing itself.
 * Each metrics record whose `primaryCenter` names a different user is added to
 * that user's cluster, provided the centre is a known subject. Clusters with
 * fewer than three members are dropped, and the rest are returned largest
 * first (clusters of equal size keep the order of `data.relationships`).
 *
 * Colours are handed out after filtering and sorting, so the returned clusters
 * take consecutive palette entries and do not repeat a colour until the
 * palette is exhausted.
 *
 * @param data Parsed relationship data; its subjects are the possible centres.
 * @param metrics Metrics records carrying each user's `primaryCenter`.
 * @returns The surviving clusters, largest first, each with a palette colour.
 */
export function detectClusters(data: ParseResult, metrics: MemberMetrics[]): ClusterInfo[] {
  const membersByCenter = new Map<string, string[]>();
  for (const center of data.relationships.keys()) {
    membersByCenter.set(center, [center]);
  }

  for (const m of metrics) {
    if (!m.primaryCenter || m.primaryCenter === m.username) continue;
    // Users whose primary centre is not a known subject are left unclustered.
    membersByCenter.get(m.primaryCenter)?.push(m.username);
  }

  const palette = ["#6366f1", "#0ea5e9", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6", "#ec4899", "#14b8a6", "#f97316"];

  return Array.from(membersByCenter, ([center, members]) => ({ center, members }))
    .filter(c => c.members.length > 2)
    .sort((a, b) => b.members.length - a.members.length)
    .map((c, idx): ClusterInfo => ({
      center: c.center,
      members: c.members,
      color: palette[idx % palette.length],
    }));
}