| """ |
| Sex Chat Detection Web App using User-Word Bipartite Graph |
| Author: Ramaguru Radhakrishnan |
| Date: 16th October 2025 |
| |
| Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory, |
| compute metrics, and visualize the user-word graph efficiently without hitting storage limits. |
| """ |
|
|
| import networkx as nx |
| import matplotlib.pyplot as plt |
| import re |
| import json |
| import gradio as gr |
| import pandas as pd |
| from io import BytesIO |
|
|
| |
| |
| |
# Load the word-severity lexicon from data.json (resolved against the current
# working directory). JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding.
with open("data.json", encoding="utf-8") as f:
    lexicon_data = json.load(f)

# Map each lexicon word to its severity label. Keys are lower-cased because
# tokenize() lower-cases all text before lookup; an entry stored with capital
# letters could otherwise never be matched.
LEXICON = {entry["word"].lower(): entry["severity"] for entry in lexicon_data}

# Numeric weight added to a user-word edge for each severity label.
SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}
|
|
| |
| |
| |
# Compiled once at import time. Matches runs of ASCII letters delimited by word
# boundaries, so letters glued to digits or underscores (e.g. "abc123") are
# skipped entirely.
_WORD_PATTERN = re.compile(r'\b[a-zA-Z]+\b')


def tokenize(text):
    """Tokenize input text into lowercase ASCII-letter words.

    Args:
        text: The message to split. Non-string values are converted with
            ``str()``. ``None`` and float NaN (e.g. a blank cell loaded by
            pandas) are treated as an empty message.

    Returns:
        list[str]: Lowercase words in order of appearance; an empty list when
        there is nothing to tokenize.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing message would be stringified to "none"/"nan" and counted as a
    # real word.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return _WORD_PATTERN.findall(str(text).lower())
|
|
def compute_URI(chats):
    """Compute the User Risk Index, conversation risk and the user-word graph.

    Every message is tokenized and each token is linked to BOTH the sender and
    the receiver of that message. The edge weight accumulates the severity
    weight of the word once per occurrence.

    For each user::

        URI = (sum of incident edge weights)
              * (number of distinct connected words / number of distinct words overall)

    Args:
        chats: Iterable of ``(sender, receiver, message)`` tuples.

    Returns:
        tuple: ``(user_risk, conv_risk, B)`` where ``user_risk`` maps each user
        that has at least one word edge to its URI rounded to 2 decimals,
        ``conv_risk`` is the mean of those URIs rounded to 2 decimals (``0``
        when no user was scored), and ``B`` is the ``networkx.Graph`` holding
        the weighted user-word edges.
    """
    B = nx.Graph()
    user_nodes = set()
    word_nodes = set()

    # NOTE(review): users and words share a single node namespace, so a user
    # whose name equals a token (e.g. user "hi") collides with that word node.
    for sender, receiver, message in chats:
        words = tokenize(message)
        for user in (sender, receiver):
            user_nodes.add(user)
            for word in words:
                word_nodes.add(word)
                # NOTE(review): words absent from LEXICON fall back to "low"
                # and therefore still add weight 1, so ordinary vocabulary
                # raises the score too -- confirm this is intended.
                weight = SEVERITY_WEIGHT.get(LEXICON.get(word, "low"), 1)
                if B.has_edge(user, word):
                    B[user][word]["weight"] += weight
                else:
                    B.add_edge(user, word, weight=weight)

    total_words = max(1, len(word_nodes))  # avoid division by zero
    user_risk = {}
    for user in user_nodes:
        # A user whose messages produced no tokens was never added to the
        # graph. Asking NetworkX for the edges of a non-node makes it treat
        # the value as a *sequence* of nodes: a string user name would be
        # scanned character by character (matching one-letter word nodes such
        # as "a" or "i"), and a non-iterable id would raise. Skip explicitly.
        if user not in B:
            continue
        edges = list(B.edges(user, data=True))
        if not edges:
            continue
        total_weight = sum(data["weight"] for _, _, data in edges)
        connected_words = len(edges)
        uri = total_weight * (connected_words / total_words)
        user_risk[user] = round(uri, 2)

    conv_risk = round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
    return user_risk, conv_risk, B
|
|
def get_top_words(user_risk, B, top_n=5):
    """List the heaviest word edges for every scored user.

    Args:
        user_risk: Mapping whose keys are the users to report on.
        B: Graph exposing ``edges(node, data=True)`` with a ``weight``
            attribute on every edge.
        top_n: Maximum number of words reported per user.

    Returns:
        dict: user -> list of ``"word(weight)"`` strings, heaviest first.
        Edges of equal weight keep the order in which the graph yields them.
    """
    def _edge_weight(edge):
        return edge[2]['weight']

    ranking = {}
    for name in user_risk:
        incident = list(B.edges(name, data=True))
        # Stable sort: equal weights stay in graph order.
        incident.sort(key=_edge_weight, reverse=True)
        ranking[name] = [
            f"{word}({attrs['weight']})" for _, word, attrs in incident[:top_n]
        ]
    return ranking
|
|
def plot_graph(B, user_risk):
    """Draw the user-word graph and return the (already closed) figure.

    Nodes that appear in ``user_risk`` are drawn in red whose opacity grows
    with the user's risk (0.2 at risk 0, fully opaque from risk 40 upwards);
    all other nodes are light blue. Edge width equals the edge weight.

    Args:
        B: ``networkx.Graph`` whose edges carry a ``weight`` attribute.
        user_risk: Mapping of user node -> risk score.

    Returns:
        matplotlib.figure.Figure: The rendered figure. It is closed in pyplot
        before being returned so pyplot does not keep it alive.
    """
    # Draw on an explicit Axes so neither the graph nor the title depends on
    # pyplot's implicit "current figure/axes" state.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        pos = nx.spring_layout(B, k=0.8)

        node_colors = [
            (1, 0, 0, min(0.2 + user_risk[node] / 50, 1))
            if node in user_risk
            else 'lightblue'
            for node in B.nodes()
        ]
        edge_widths = [data['weight'] for _, _, data in B.edges(data=True)]

        nx.draw(
            B,
            pos,
            ax=ax,
            with_labels=True,
            node_color=node_colors,
            node_size=800,
            font_size=10,
            width=edge_widths,
        )
        ax.set_title("User-Word Bipartite Graph with Risk Heatmap")
    finally:
        # Always release the figure from pyplot's registry, even when drawing
        # fails; otherwise every failed request would leak a figure.
        plt.close(fig)
    return fig
|
|
| |
| |
| |
# Columns every uploaded chat table must provide.
_REQUIRED_COLUMNS = ("sender", "receiver", "message")


def _read_chat_table(file_bytes):
    """Parse the upload as CSV, then JSON; return ``(frame, error_message)``.

    Exactly one element of the returned pair is ``None``.
    """
    parsed_any = False
    for reader in (pd.read_csv, pd.read_json):
        try:
            frame = reader(BytesIO(file_bytes))
        except Exception:
            # The pandas readers raise many unrelated exception types on bad
            # input; any failure simply means "not this format".
            continue
        parsed_any = True
        # read_csv frequently "succeeds" on JSON text (yielding junk columns),
        # so only accept a parse that actually has the required columns and
        # otherwise fall through to the next reader.
        if all(col in frame.columns for col in _REQUIRED_COLUMNS):
            return frame, None
    if parsed_any:
        return None, "File must contain columns: sender, receiver, message"
    return None, "Unsupported file format. Please upload CSV or JSON."


def analyze_chat(file_bytes):
    """Analyze an uploaded chat file and build the metrics text and graph.

    The upload must be CSV or JSON with columns: sender, receiver, message.
    Everything is processed in memory.

    Args:
        file_bytes: Raw bytes of the uploaded file, or ``None`` when the file
            input has been cleared.

    Returns:
        tuple: ``(metrics_text, figure)``. On any problem the first element is
        a human-readable message and the second is ``None``.
    """
    if file_bytes is None:
        # The change event also fires when the upload is removed.
        return "No file uploaded. Please upload CSV or JSON.", None

    try:
        if len(file_bytes) > 10_000_000:
            return "File too large. Max size is 10 MB.", None

        df, problem = _read_chat_table(file_bytes)
        if df is None:
            return problem, None

        # Rows without a sender or receiver cannot be attributed to anyone.
        df = df.dropna(subset=["sender", "receiver"])
        # A missing message is an empty message, not the literal word "nan".
        messages = df["message"].astype(object).where(df["message"].notna(), "")

        chats = list(zip(df["sender"], df["receiver"], messages))
        user_risk, conv_risk, B = compute_URI(chats)
        top_words = get_top_words(user_risk, B)
        status = "Sex Chat Detected" if conv_risk >= 10 else "Not Detected"

        lines = [
            f"Conversation Risk Index: {conv_risk}",
            f"Status: {status}",
            "",
            "User Risk Index:",
        ]
        lines.extend(
            f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}"
            for user, uri in user_risk.items()
        )
        metrics_text = "\n".join(lines) + "\n"

        fig = plot_graph(B, user_risk)
        return metrics_text, fig

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # request handler.
        return f"Error processing file: {e}", None
|
|
| |
| |
| |
# Page layout: upload control and metrics text in one column, the rendered
# risk graph in the other.
with gr.Blocks(title="Sex Chat Detection") as iface:
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Chat CSV/JSON",
                file_types=['.csv', '.json'],
                type='binary',
            )
            metrics_output = gr.Textbox(label="Metrics", lines=15)
        with gr.Column(scale=1):
            graph_output = gr.Plot(label="Risk Graph")

    # Re-run the analysis whenever the uploaded file changes.
    file_input.change(
        fn=analyze_chat,
        inputs=file_input,
        outputs=[metrics_output, graph_output],
    )

# Start the Gradio server.
iface.launch()
|
|
|
|