"""
Sex Chat Detection Web App using User-Word Bipartite Graph
Author: Ramaguru Radhakrishnan
Date: 16th October 2025
Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory,
compute metrics, and visualize the user-word graph efficiently without hitting storage limits.
"""
import networkx as nx
import matplotlib.pyplot as plt
import re
import json
import gradio as gr
import pandas as pd
from io import BytesIO
# -------------------------------
# Load Lexicon from JSON
# -------------------------------
# Load the lexicon once at import time. The encoding is pinned to UTF-8 so the
# file parses the same way regardless of the platform's default locale.
with open("data.json", encoding="utf-8") as f:
    lexicon_data = json.load(f)

# Map each lexicon word to its severity label. Keys are lower-cased because
# tokenize() lower-cases every token before lookup, so a mixed-case entry
# would otherwise never match.
LEXICON = {
    str(entry["word"]).lower(): entry["severity"] for entry in lexicon_data
}

# Numeric weight contributed by one occurrence of a word of each severity.
SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}
# -------------------------------
# Helper Functions
# -------------------------------
def tokenize(text):
    """Tokenize input text into lowercase words.

    Args:
        text: The message to split. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as an empty message.

    Returns:
        A list of lowercase tokens, in order of appearance. A token is a run
        of ASCII letters delimited by word boundaries, so runs glued to digits
        or underscores (e.g. ``abc123``) are not returned.
    """
    # Missing values must not be stringified: str(None) is "None" and
    # str(float("nan")) is "nan", which would otherwise be counted as words.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return re.findall(r'\b[a-zA-Z]+\b', str(text).lower())
def compute_URI(chats):
    """Compute the User Risk Index (URI) and the user-word graph for a chat.

    Every token of a message is linked to both the sender and the receiver.
    An edge's ``weight`` accumulates the severity weight of each occurrence;
    words missing from the lexicon count as ``"low"`` severity.

    Args:
        chats: Iterable of ``(sender, receiver, message)`` triples.

    Returns:
        A tuple ``(user_risk, conv_risk, B)`` where ``user_risk`` maps each
        user that has at least one edge to its URI rounded to two decimals,
        ``conv_risk`` is the mean of those URIs rounded to two decimals
        (``0`` when there are none), and ``B`` is the ``networkx.Graph``.
    """
    B = nx.Graph()
    user_nodes = set()
    word_nodes = set()

    # NOTE(review): users and words share one node namespace, so a user whose
    # name equals a token is merged with that word node.
    for sender, receiver, message in chats:
        words = tokenize(message)
        for user in [sender, receiver]:
            user_nodes.add(user)
            for word in words:
                word_nodes.add(word)
                weight = SEVERITY_WEIGHT.get(LEXICON.get(word, "low"), 1)
                if B.has_edge(user, word):
                    B[user][word]["weight"] += weight
                else:
                    B.add_edge(user, word, weight=weight)

    # Compute URI
    vocabulary_size = max(1, len(word_nodes))
    user_risk = {}
    for user in user_nodes:
        # A user whose messages produced no tokens was never added to the
        # graph. Asking networkx for the edges of a missing node makes it
        # treat the value as a *container* of nodes (a string is walked
        # character by character; a non-iterable raises), so skip it here.
        if user not in B:
            continue
        # Materialise once: the edge view would otherwise be iterated again
        # just to count it.
        weights = [d["weight"] for _, _, d in B.edges(user, data=True)]
        if not weights:
            continue
        uri = sum(weights) * (len(weights) / vocabulary_size)
        user_risk[user] = round(uri, 2)

    # Conversation-level risk
    conv_risk = round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
    return user_risk, conv_risk, B
def get_top_words(user_risk, B, top_n=5):
    """Return the heaviest edges of each scored user as display strings.

    Args:
        user_risk: Mapping whose keys are the users to report on.
        B: Graph object exposing ``edges(node, data=True)``.
        top_n: Maximum number of entries returned per user.

    Returns:
        A dict mapping each user to a list of ``"word(weight)"`` strings,
        ordered by descending edge weight (ties keep the graph's edge order).
    """
    def edge_weight(edge):
        # Each edge arrives as (node, neighbour, attribute_dict).
        return edge[2]['weight']

    def describe(user):
        ranked = sorted(B.edges(user, data=True), key=edge_weight, reverse=True)
        return [
            f"{neighbour}({attrs['weight']})"
            for _, neighbour, attrs in ranked[:top_n]
        ]

    return {user: describe(user) for user in user_risk}
def plot_graph(B, user_risk):
    """Return a matplotlib figure of the user-word graph, shaded by risk.

    Nodes present in ``user_risk`` are drawn red with an opacity that grows
    with their risk (``0.2 + risk / 50``, capped at 1); every other node is
    light blue. Edge line widths equal the edge ``weight`` attribute.

    Args:
        B: The ``networkx.Graph`` to draw; every edge needs a ``weight``.
        user_risk: Mapping of user node to its risk score.

    Returns:
        The drawn ``Figure``, already closed in pyplot so that pyplot does
        not keep it alive after the caller is done with it.
    """
    fig = plt.figure(figsize=(10, 8))
    try:
        # Draw on an explicit Axes so the graph and the title are bound to
        # this figure rather than to pyplot's implicit "current" figure.
        ax = fig.add_subplot(111)
        pos = nx.spring_layout(B, k=0.8)

        node_colors = [
            (1, 0, 0, min(0.2 + user_risk[n] / 50, 1))
            if n in user_risk
            else 'lightblue'
            for n in B.nodes()
        ]
        edge_widths = [B[u][v]['weight'] for u, v in B.edges()]

        nx.draw(B, pos, ax=ax, with_labels=True, node_color=node_colors,
                node_size=800, font_size=10, width=edge_widths)
        ax.set_title("User-Word Bipartite Graph with Risk Heatmap")
    finally:
        # Always deregister the figure from pyplot, including when layout or
        # drawing raises; otherwise it stays referenced and leaks memory.
        plt.close(fig)
    return fig
# -------------------------------
# Gradio Interface Function
# -------------------------------
def _load_chat_frame(file_bytes, required_columns):
    """Parse *file_bytes* as CSV, then as JSON, and pick the first usable frame.

    Returns:
        A tuple ``(frame, parsed_any)``. ``frame`` is the first parsed
        DataFrame that contains every required column, or ``None``.
        ``parsed_any`` tells whether at least one parser succeeded.
    """
    parsed_any = False
    for reader in (pd.read_csv, pd.read_json):
        try:
            candidate = reader(BytesIO(file_bytes))
        except Exception:
            # Parser failures span several unrelated exception types
            # (ValueError, UnicodeDecodeError, pandas parser errors, ...);
            # any of them simply means "not this format".
            continue
        parsed_any = True
        # read_csv frequently "succeeds" on compact JSON text, yielding
        # nonsense columns, so a successful parse alone is not enough: keep
        # trying the next format until the required columns show up.
        if all(col in candidate.columns for col in required_columns):
            return candidate, True
    return None, parsed_any


def analyze_chat(file_bytes):
    """
    Analyze an uploaded CSV or JSON chat file with columns: sender, receiver, message.
    Processes entirely in memory to avoid storage issues.

    Args:
        file_bytes: Raw bytes of the uploaded file, or ``None`` when no file
            is selected.

    Returns:
        A tuple ``(text, figure)``. On success ``text`` holds the metrics
        report and ``figure`` the graph plot; on any failure ``text`` is a
        user-facing message and ``figure`` is ``None``.
    """
    required_columns = ["sender", "receiver", "message"]
    try:
        # No file selected: answer with a prompt instead of letting len()
        # fail on None.
        if file_bytes is None:
            return "Please upload a CSV or JSON chat file.", None

        # Manual file size check (10 MB max)
        if len(file_bytes) > 10_000_000:
            return "File too large. Max size is 10 MB.", None

        df, parsed_any = _load_chat_frame(file_bytes, required_columns)
        if df is None:
            if parsed_any:
                return "File must contain columns: sender, receiver, message", None
            return "Unsupported file format. Please upload CSV or JSON.", None

        chats = list(df[required_columns].itertuples(index=False, name=None))
        user_risk, conv_risk, B = compute_URI(chats)
        top_words = get_top_words(user_risk, B)
        status = "Sex Chat Detected" if conv_risk >= 10 else "Not Detected"

        # Prepare metrics text
        header = f"Conversation Risk Index: {conv_risk}\nStatus: {status}\n\nUser Risk Index:\n"
        user_lines = "".join(
            f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}\n"
            for user, uri in user_risk.items()
        )
        metrics_text = header + user_lines

        fig = plot_graph(B, user_risk)
        return metrics_text, fig
    except Exception as e:
        # UI boundary: report the failure in the metrics box instead of
        # raising.
        return f"Error processing file: {e}", None
# -------------------------------
# Gradio App Launch
# -------------------------------
# Nesting below is reconstructed from the component order: the first column
# holds the upload widget and the metrics box, the second holds the graph.
with gr.Blocks(title="Sex Chat Detection") as iface:
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload Chat CSV/JSON",
                file_types=[".csv", ".json"],
                type="binary",
            )
            metrics_output = gr.Textbox(label="Metrics", lines=15)
        with gr.Column(scale=1):
            graph_output = gr.Plot(label="Risk Graph")

    # Re-run the analysis every time the uploaded file changes.
    file_input.change(
        fn=analyze_chat,
        inputs=file_input,
        outputs=[metrics_output, graph_output],
    )

iface.launch()
|