"""
Sex Chat Detection Web App using User-Word Bipartite Graph

Author: Ramaguru Radhakrishnan
Date: 16th October 2025

Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory,
compute metrics, and visualize the user-word graph efficiently without
hitting storage limits.
"""

import json
import math
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

# -------------------------------
# Load Lexicon from JSON
# -------------------------------
with open("data.json", encoding="utf-8") as f:
    lexicon_data = json.load(f)

# tokenize() always lowercases its output, so the lexicon keys are lowercased
# as well; otherwise a mixed-case lexicon entry could never match a token.
LEXICON = {
    str(entry["word"]).lower(): entry["severity"] for entry in lexicon_data
}
SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}

# Columns every uploaded chat file must provide.
REQUIRED_COLUMNS = ("sender", "receiver", "message")
# Upload size limit in bytes (10 MB).
MAX_FILE_BYTES = 10_000_000
# Conversation risk at or above this value is reported as detected.
RISK_THRESHOLD = 10

# Compiled once; matches runs of ASCII letters delimited by word boundaries.
_WORD_RE = re.compile(r'\b[a-zA-Z]+\b')


# -------------------------------
# Helper Functions
# -------------------------------
def tokenize(text):
    """Tokenize input text into lowercase words.

    Args:
        text: The message to split. Non-string values are converted with
            ``str()``; ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty message.

    Returns:
        A list of lowercase alphabetic tokens, in order of appearance.
    """
    # Without this guard a missing message would become the token "nan".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []
    return _WORD_RE.findall(str(text).lower())


def _word_weight(word):
    """Return the severity weight of *word*.

    Words absent from the lexicon are treated as "low" severity, and an
    unrecognized severity label falls back to a weight of 1.
    """
    return SEVERITY_WEIGHT.get(LEXICON.get(word, "low"), 1)


def compute_URI(chats):
    """Compute User Risk Index and Bipartite Graph from chat data.

    Every word of a message is linked to both the sender and the receiver of
    that message; repeated occurrences accumulate on the edge weight.

    For each user with at least one linked word::

        URI = total_edge_weight * (linked_words / distinct_words_overall)

    Args:
        chats: Iterable of ``(sender, receiver, message)`` tuples.

    Returns:
        A tuple ``(user_risk, conv_risk, B)`` where ``user_risk`` maps each
        user to their URI rounded to 2 decimals, ``conv_risk`` is the mean of
        those values rounded to 2 decimals (0 when there are none), and ``B``
        is the ``networkx.Graph`` linking users to words.
    """
    B = nx.Graph()
    user_nodes = set()
    word_nodes = set()

    for sender, receiver, message in chats:
        words = tokenize(message)
        # De-duplicate so a message a user sends to themselves is not
        # counted twice for that user.
        for user in dict.fromkeys((sender, receiver)):
            user_nodes.add(user)
            for word in words:
                word_nodes.add(word)
                weight = _word_weight(word)
                if B.has_edge(user, word):
                    B[user][word]["weight"] += weight
                else:
                    B.add_edge(user, word, weight=weight)

    # Compute URI
    total_words = max(1, len(word_nodes))
    user_risk = {}
    for user in user_nodes:
        # A user whose messages produced no tokens was never added to the
        # graph; querying the graph for such a node is not meaningful.
        if user not in B:
            continue
        neighbours = B.adj[user]
        if not neighbours:
            continue
        total_weight = sum(data["weight"] for data in neighbours.values())
        connected_words = len(neighbours)
        uri = total_weight * (connected_words / total_words)
        user_risk[user] = round(uri, 2)

    # Conversation-level risk
    conv_risk = (
        round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
    )
    return user_risk, conv_risk, B


def get_top_words(user_risk, B, top_n=5):
    """Return top contributing words per user.

    Args:
        user_risk: Mapping of user to URI; only its keys are used.
        B: The user-word graph with a ``weight`` attribute on every edge.
        top_n: Maximum number of words to report per user.

    Returns:
        A dict mapping each user to a list of ``"word(weight)"`` strings,
        ordered by descending edge weight.
    """
    top_words = {}
    for user in user_risk:
        if user not in B:
            top_words[user] = []
            continue
        ranked = sorted(
            B.edges(user, data=True),
            key=lambda edge: edge[2]['weight'],
            reverse=True,
        )
        top_words[user] = [
            f"{word}({data['weight']})" for _, word, data in ranked[:top_n]
        ]
    return top_words


def plot_graph(B, user_risk):
    """Return matplotlib figure of user-word bipartite graph and close to free memory.

    Nodes present in ``user_risk`` are drawn red with an opacity that grows
    with their risk (capped at fully opaque); all other nodes are light blue.
    Edge widths equal the edge weights.

    Args:
        B: The user-word graph with a ``weight`` attribute on every edge.
        user_risk: Mapping of user to URI.

    Returns:
        The ``matplotlib.figure.Figure`` that was drawn. It has already been
        closed in pyplot, so pyplot no longer holds a reference to it.
    """
    # Draw on an explicit Axes rather than pyplot's implicit "current" figure,
    # so the title and the graph are guaranteed to land on this figure.
    fig, ax = plt.subplots(figsize=(10, 8))
    pos = nx.spring_layout(B, k=0.8)

    node_colors = []
    for node in B.nodes():
        if node in user_risk:
            alpha = min(0.2 + user_risk[node] / 50, 1)
            node_colors.append((1, 0, 0, alpha))
        else:
            node_colors.append('lightblue')

    nx.draw(
        B,
        pos,
        ax=ax,
        with_labels=True,
        node_color=node_colors,
        node_size=800,
        font_size=10,
        width=[B[u][v]['weight'] for u, v in B.edges()],
    )
    ax.set_title("User-Word Bipartite Graph with Risk Heatmap")
    plt.close(fig)  # release memory
    return fig


def _read_chat_frame(file_bytes):
    """Parse *file_bytes* as CSV, then as JSON.

    Returns:
        A tuple ``(df, parsed)``. ``df`` is the first parsed DataFrame that
        contains all required columns, or ``None`` if there is none.
        ``parsed`` is True when at least one parser accepted the input.
    """
    parsed = False
    for reader in (pd.read_csv, pd.read_json):
        try:
            df = reader(BytesIO(file_bytes))
        except Exception:
            # The parsers raise a variety of error types on malformed input;
            # any failure here just means "not this format".
            continue
        if not isinstance(df, pd.DataFrame):
            continue
        parsed = True
        # read_csv is lenient and can accept JSON text without raising, so a
        # successful parse alone does not prove the format was right.
        if all(col in df.columns for col in REQUIRED_COLUMNS):
            return df, True
    return None, parsed


# -------------------------------
# Gradio Interface Function
# -------------------------------
def analyze_chat(file_bytes):
    """
    Upload CSV or JSON chat file with columns: sender, receiver, message.
    Processes entirely in memory to avoid storage issues.

    Args:
        file_bytes: Raw content of the uploaded file, or ``None`` when no
            file is selected (for example after the upload is cleared).

    Returns:
        A tuple ``(metrics_text, figure)``. On any failure ``metrics_text``
        carries a human-readable message and ``figure`` is ``None``.
    """
    if file_bytes is None:
        return "Please upload a CSV or JSON chat file.", None

    try:
        # Manual file size check (10 MB max)
        if len(file_bytes) > MAX_FILE_BYTES:
            return "File too large. Max size is 10 MB.", None

        # Try CSV first, fallback to JSON
        df, parsed = _read_chat_frame(file_bytes)
        if df is None:
            if parsed:
                return (
                    "File must contain columns: sender, receiver, message",
                    None,
                )
            return (
                "Unsupported file format. Please upload CSV or JSON.",
                None,
            )

        chats = list(
            df[list(REQUIRED_COLUMNS)].itertuples(index=False, name=None)
        )
        user_risk, conv_risk, B = compute_URI(chats)
        top_words = get_top_words(user_risk, B)
        status = (
            "Sex Chat Detected" if conv_risk >= RISK_THRESHOLD
            else "Not Detected"
        )

        # Prepare metrics text
        lines = [
            f"Conversation Risk Index: {conv_risk}",
            f"Status: {status}",
            "",
            "User Risk Index:",
        ]
        lines.extend(
            f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}"
            for user, uri in user_risk.items()
        )
        metrics_text = "\n".join(lines) + "\n"

        fig = plot_graph(B, user_risk)
        return metrics_text, fig
    except Exception as e:
        # UI boundary: report the problem in the metrics box instead of
        # letting the request fail.
        return f"Error processing file: {e}", None


# -------------------------------
# Gradio App Launch
# -------------------------------
with gr.Blocks(title="Sex Chat Detection") as iface:
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Chat CSV/JSON",
                file_types=['.csv', '.json'],
                type='binary',
            )
            metrics_output = gr.Textbox(label="Metrics", lines=15)
        with gr.Column(scale=1):
            graph_output = gr.Plot(label="Risk Graph")

    # Connect inputs and outputs
    file_input.change(
        fn=analyze_chat,
        inputs=file_input,
        outputs=[metrics_output, graph_output],
    )

iface.launch()