# STOP / app.py
# ramagururadhakrishnan's picture
# Position of the Output area
# a124cb4 verified
"""
Sex Chat Detection Web App using User-Word Bipartite Graph
Author: Ramaguru Radhakrishnan
Date: 16th October 2025
Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory,
compute metrics, and visualize the user-word graph efficiently without hitting storage limits.
"""
import networkx as nx
import matplotlib.pyplot as plt
import re
import json
import gradio as gr
import pandas as pd
from io import BytesIO
# -------------------------------
# Load Lexicon from JSON
# -------------------------------
# Load the risk lexicon. The file is expected to hold an iterable of objects
# that each carry a "word" and a "severity" key. It is read as UTF-8
# explicitly so the result does not depend on the platform's locale encoding.
with open("data.json", encoding="utf-8") as f:
    lexicon_data = json.load(f)

# Map word -> severity label. Keys are stripped and lower-cased because
# message tokens are lower-cased before they are looked up here; an entry
# stored with capitals or stray whitespace would otherwise never match.
LEXICON = {
    str(entry["word"]).strip().lower(): entry["severity"]
    for entry in lexicon_data
}

# Numeric edge weight contributed by one occurrence of a word of each severity.
SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}
# -------------------------------
# Helper Functions
# -------------------------------
def tokenize(text):
    """Split *text* into lowercase alphabetic words.

    Missing values -- ``None`` or a float NaN (the value pandas uses for an
    empty cell) -- produce an empty list instead of the spurious tokens
    ``"none"`` / ``"nan"``. Any other non-string value is converted with
    ``str()`` before matching.

    Args:
        text: The message to tokenize; usually a string.

    Returns:
        A list of runs of ASCII letters that are delimited by word
        boundaries, in order of appearance. Letter runs glued to digits or
        underscores (e.g. ``"abc123"``) are not returned.
    """
    # NaN is the only float that compares unequal to itself, so this detects
    # it without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return re.findall(r'\b[a-zA-Z]+\b', str(text).lower())
def compute_URI(chats):
    """Build the user-word graph and compute per-user and conversation risk.

    Every word of a message is linked to both the sender and the receiver.
    An edge's ``weight`` accumulates the severity weight of the word each
    time it occurs in a message involving that user.

    For each user the User Risk Index is::

        URI = (sum of edge weights) * (words linked to user / all distinct words)

    rounded to two decimals. The conversation risk is the mean URI over the
    users that have one, also rounded to two decimals (0 when there are none).

    Args:
        chats: Iterable of ``(sender, receiver, message)`` triples.

    Returns:
        A tuple ``(user_risk, conv_risk, B)`` where ``user_risk`` maps each
        user with at least one linked word to its URI, ``conv_risk`` is the
        conversation-level score and ``B`` is the ``networkx.Graph``.
    """
    B = nx.Graph()
    user_nodes = set()
    word_nodes = set()

    for sender, receiver, message in chats:
        words = tokenize(message)
        for user in [sender, receiver]:
            user_nodes.add(user)
            for word in words:
                word_nodes.add(word)
                # NOTE(review): words absent from the lexicon are scored as
                # "low" (weight 1), so ordinary vocabulary also raises the
                # index -- confirm this is intended.
                weight = SEVERITY_WEIGHT.get(LEXICON.get(word, "low"), 1)
                if B.has_edge(user, word):
                    B[user][word]["weight"] += weight
                else:
                    B.add_edge(user, word, weight=weight)

    # Compute URI
    total_words = max(1, len(word_nodes))
    user_risk = {}
    for user in user_nodes:
        # A user whose messages produced no tokens was never added to the
        # graph. Graph.edges() treats a value that is not a node as a
        # container of nodes, so a string name would be scanned character by
        # character (matching single-letter word nodes) and a non-iterable id
        # would raise. Skip such users explicitly.
        if user not in B:
            continue
        edges = list(B.edges(user, data=True))
        if not edges:
            continue
        total_weight = sum(d["weight"] for _, _, d in edges)
        connected_words = len(edges)
        uri = total_weight * (connected_words / total_words)
        user_risk[user] = round(uri, 2)

    # Conversation-level risk
    conv_risk = round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
    return user_risk, conv_risk, B
def get_top_words(user_risk, B, top_n=5):
    """Return the heaviest words linked to each user.

    Args:
        user_risk: Mapping whose keys are the users to report on.
        B: Graph object; ``B.edges(user, data=True)`` must yield
            ``(user, word, attrs)`` triples where ``attrs['weight']`` is the
            edge weight.
        top_n: Maximum number of words to list per user.

    Returns:
        A dict mapping each user to a list of ``"word(weight)"`` strings,
        ordered from heaviest to lightest edge.
    """
    def edge_weight(edge):
        # Third element of an edge triple is its attribute dict.
        return edge[2]['weight']

    ranking = {}
    for name in user_risk:
        ranked = sorted(B.edges(name, data=True), key=edge_weight, reverse=True)
        ranking[name] = [
            f"{word}({attrs['weight']})" for _, word, attrs in ranked[:top_n]
        ]
    return ranking
def plot_graph(B, user_risk):
    """Draw the user-word graph and return the matplotlib figure.

    User nodes (those present in ``user_risk``) are drawn in red with an
    opacity that grows with their risk (0.2 at risk 0, fully opaque from
    risk 40 upward); all other nodes are light blue. Edge line width equals
    the edge's ``weight``.

    Args:
        B: The ``networkx.Graph`` to draw; every edge needs a ``weight``.
        user_risk: Mapping of user node -> risk score.

    Returns:
        The ``matplotlib`` figure. It has already been closed in pyplot so
        that pyplot does not keep it alive, but it can still be rendered.
    """
    fig = plt.figure(figsize=(10, 8))
    # Draw on an explicit Axes instead of pyplot's process-global "current"
    # figure/axes, which is not safe when several requests are rendered
    # concurrently. Without an explicit Axes nx.draw also creates one that
    # fills the whole figure, leaving no room for the title.
    ax = fig.add_subplot(111)
    pos = nx.spring_layout(B, k=0.8)

    node_colors = []
    for node in B.nodes():
        if node in user_risk:
            alpha = min(0.2 + user_risk[node] / 50, 1)
            node_colors.append((1, 0, 0, alpha))
        else:
            node_colors.append('lightblue')

    edge_widths = [B[u][v]['weight'] for u, v in B.edges()]
    nx.draw(B, pos, ax=ax, with_labels=True, node_color=node_colors,
            node_size=800, font_size=10, width=edge_widths)
    ax.set_title("User-Word Bipartite Graph with Risk Heatmap")

    plt.close(fig)  # release pyplot's reference to free memory
    return fig
# -------------------------------
# Gradio Interface Function
# -------------------------------
_REQUIRED_COLUMNS = ("sender", "receiver", "message")


def _load_chat_frame(file_bytes):
    """Parse *file_bytes* as CSV, then as JSON; return ``(frame, parsed)``.

    ``frame`` is the first parsed DataFrame that contains every required
    column, or ``None`` if no parser produced one. ``parsed`` tells whether
    at least one parser accepted the data at all, so the caller can tell
    "unsupported format" from "missing columns".
    """
    parsed = False
    for reader in (pd.read_csv, pd.read_json):
        try:
            frame = reader(BytesIO(file_bytes))
        except Exception:
            # This parser rejected the data; give the next one a chance.
            continue
        parsed = True
        # read_csv is lenient and will happily "parse" JSON text into a frame
        # with nonsense columns, so a successful parse alone is not enough:
        # keep trying until a frame actually has the expected columns.
        if all(col in frame.columns for col in _REQUIRED_COLUMNS):
            return frame, parsed
    return None, parsed


def analyze_chat(file_bytes):
    """Analyze an uploaded chat file and return ``(metrics_text, figure)``.

    The upload must be CSV or JSON with the columns ``sender``, ``receiver``
    and ``message``. Everything is processed in memory.

    Args:
        file_bytes: Raw bytes of the uploaded file, or ``None`` when no file
            is selected (e.g. the upload widget was cleared).

    Returns:
        A tuple of the metrics report (or an error message) and the graph
        figure (``None`` whenever an error message is returned).
    """
    try:
        if file_bytes is None:
            return "No file uploaded.", None

        # Manual file size check (10 MB max)
        if len(file_bytes) > 10_000_000:
            return "File too large. Max size is 10 MB.", None

        df, parsed = _load_chat_frame(file_bytes)
        if df is None:
            if parsed:
                return "File must contain columns: sender, receiver, message", None
            return "Unsupported file format. Please upload CSV or JSON.", None

        chats = list(df[list(_REQUIRED_COLUMNS)].itertuples(index=False, name=None))
        user_risk, conv_risk, B = compute_URI(chats)
        top_words = get_top_words(user_risk, B)
        status = "Sex Chat Detected" if conv_risk >= 10 else "Not Detected"

        # Prepare metrics text
        header = f"Conversation Risk Index: {conv_risk}\nStatus: {status}\n\nUser Risk Index:\n"
        user_lines = [
            f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}\n"
            for user, uri in user_risk.items()
        ]
        metrics_text = header + "".join(user_lines)

        fig = plot_graph(B, user_risk)
        return metrics_text, fig
    except Exception as e:
        # UI boundary: report the failure in the metrics box instead of
        # letting the handler crash.
        return f"Error processing file: {e}", None
# -------------------------------
# Gradio App Launch
# -------------------------------
# Two-column layout: upload widget and metrics text on the left, graph on the
# right. NOTE(review): the column each widget belongs to was reconstructed
# from statement order -- confirm against the deployed layout.
with gr.Blocks(title="Sex Chat Detection") as iface:
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Chat CSV/JSON", file_types=['.csv', '.json'], type='binary')
            metrics_output = gr.Textbox(label="Metrics", lines=15)
        with gr.Column(scale=1):
            graph_output = gr.Plot(label="Risk Graph")
    # Connect inputs and outputs: re-run the analysis whenever the selected
    # file changes.
    file_input.change(fn=analyze_chat, inputs=file_input, outputs=[metrics_output, graph_output])

# Only start the server when executed as a script, so importing this module
# (e.g. for tests) does not launch the web app as a side effect.
if __name__ == "__main__":
    iface.launch()