Spaces:

namchain
/

STOP

Sleeping

App Files Files Community

STOP / app.py

ramagururadhakrishnan

Position of the Output area

a124cb4 verified 6 months ago

raw

history blame contribute delete

5.49 kB

	"""
	Sex Chat Detection Web App using User-Word Bipartite Graph
	Author: Ramaguru Radhakrishnan
	Date: 16th October 2025

	Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory,
	compute metrics, and visualize the user-word graph efficiently without hitting storage limits.
	"""

	import networkx as nx
	import matplotlib.pyplot as plt
	import re
	import json
	import gradio as gr
	import pandas as pd
	from io import BytesIO

	# -------------------------------
	# Load Lexicon from JSON
	# -------------------------------
	# Read the lexicon with an explicit encoding so that non-ASCII entries are
	# decoded the same way regardless of the host's default locale.
	with open("data.json", encoding="utf-8") as f:
		lexicon_data = json.load(f)

	# Word -> severity label. Both sides are lowercased: tokenize() lowercases
	# every message before lookup, and SEVERITY_WEIGHT is keyed by lowercase
	# labels, so a mixed-case entry in data.json would otherwise never match.
	LEXICON = {
		str(entry["word"]).strip().lower(): str(entry["severity"]).strip().lower()
		for entry in lexicon_data
	}

	# Numeric weight contributed by one occurrence of a word of each severity.
	SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}

	# -------------------------------
	# Helper Functions
	# -------------------------------
	def tokenize(text):
		"""Split *text* into a list of lowercase alphabetic words.

		Args:
			text: Message content. Non-string values are converted with
				``str()``; ``None`` and float NaN (what pandas yields for an
				empty cell) are treated as an empty message.

		Returns:
			The runs of ASCII letters delimited by word boundaries in the
			lowercased text, in order of appearance. Runs that touch a digit
			or underscore are not words and are skipped.
		"""
		# NaN is the only value that is not equal to itself. Without this guard
		# a missing message would be stringified and counted as the word "nan"
		# (or "none").
		if text is None or (isinstance(text, float) and text != text):
			return []
		# The text is lowercased first, so matching a-z alone is sufficient.
		return re.findall(r'\b[a-z]+\b', str(text).lower())

	def compute_URI(chats):
		"""Build the user-word bipartite graph and score each user's risk.

		Every message is tokenized and each of its words is linked to both the
		sender and the receiver. An edge's weight grows by the word's severity
		weight each time the pair co-occurs; words that are not in ``LEXICON``
		count as ``"low"`` severity.

		Args:
			chats: Iterable of ``(sender, receiver, message)`` tuples.

		Returns:
			A tuple ``(user_risk, conv_risk, B)``:
			``user_risk`` maps each user that has at least one word edge to
			its User Risk Index, rounded to two decimals;
			``conv_risk`` is the mean of those indices, or ``0`` if there are
			none; ``B`` is the ``networkx.Graph`` holding the weighted edges.
		"""
		B = nx.Graph()
		user_nodes = set()
		word_nodes = set()

		for sender, receiver, message in chats:
			words = tokenize(message)
			word_nodes.update(words)
			# Severity depends only on the word, so resolve it once per word
			# rather than once per (user, word) pair.
			weighted_words = [
				(word, SEVERITY_WEIGHT.get(LEXICON.get(word, "low"), 1))
				for word in words
			]
			for user in (sender, receiver):
				user_nodes.add(user)
				for word, weight in weighted_words:
					if B.has_edge(user, word):
						B[user][word]["weight"] += weight
					else:
						B.add_edge(user, word, weight=weight)

		# URI = total edge weight * (distinct words used / vocabulary size).
		vocabulary_size = max(1, len(word_nodes))
		user_risk = {}
		for user in user_nodes:
			# A user whose messages produced no words never entered the graph.
			# Asking networkx for the edges of a missing node makes it treat
			# the name as a collection of nodes (for a string: its characters)
			# and may return other nodes' edges, so skip such users explicitly.
			if user not in B:
				continue
			# Nodes are only ever created by add_edge, so a present user has
			# at least one neighbour.
			neighbours = B[user]
			total_weight = sum(attrs["weight"] for attrs in neighbours.values())
			uri = total_weight * (len(neighbours) / vocabulary_size)
			user_risk[user] = round(uri, 2)

		# Conversation-level risk is the plain average over scored users.
		conv_risk = round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
		return user_risk, conv_risk, B

	def get_top_words(user_risk, B, top_n=5):
	"""Return top contributing words per user."""
	top_words = {}
	for user in user_risk:
	edges = B.edges(user, data=True)
	sorted_edges = sorted(edges, key=lambda x: x[2]['weight'], reverse=True)
	words = [f"{w}({d['weight']})" for _, w, d in sorted_edges[:top_n]]
	top_words[user] = words
	return top_words

	def plot_graph(B, user_risk):
		"""Draw the user-word bipartite graph and return the figure.

		Nodes that appear in ``user_risk`` are drawn red, with opacity
		``min(0.2 + risk / 50, 1)``; every other node is light blue. Edge
		widths equal the edge weights. Node positions come from a spring
		layout.

		Args:
			B: ``networkx.Graph`` whose edges carry a ``"weight"`` attribute.
			user_risk: Mapping of user node -> risk index.

		Returns:
			The matplotlib ``Figure``. It has already been closed in pyplot,
			so the caller does not need to release it.
		"""
		# Create the axes explicitly and hand them to networkx. If nx.draw()
		# has to create its own axes they fill the entire figure, leaving no
		# room above them for the title; drawing on a named axes also avoids
		# relying on pyplot's implicit "current figure".
		fig, ax = plt.subplots(figsize=(10, 8))
		pos = nx.spring_layout(B, k=0.8)

		node_colors = [
			(1, 0, 0, min(0.2 + user_risk[n] / 50, 1)) if n in user_risk else 'lightblue'
			for n in B.nodes()
		]
		edge_widths = [B[u][v]['weight'] for u, v in B.edges()]

		nx.draw(B, pos, ax=ax, with_labels=True, node_color=node_colors, node_size=800,
			font_size=10, width=edge_widths)
		ax.set_title("User-Word Bipartite Graph with Risk Heatmap")

		# Deregister the figure from pyplot so repeated requests do not pile up
		# open figures; the Figure object itself remains usable by the caller.
		plt.close(fig)
		return fig

	# -------------------------------
	# Gradio Interface Function
	# -------------------------------
	def analyze_chat(file_bytes):
		"""Analyse an uploaded chat file and return its risk report and graph.

		The upload must be CSV or JSON with the columns ``sender``,
		``receiver`` and ``message``. It is parsed from the in-memory bytes.

		Args:
			file_bytes: Raw bytes of the uploaded file, or ``None`` when no
				file is selected.

		Returns:
			A ``(text, figure)`` tuple. On success ``text`` is the metrics
			report and ``figure`` the graph plot. On any failure ``text`` is a
			message describing the problem and ``figure`` is ``None``; this
			function does not raise.
		"""
		required = ["sender", "receiver", "message"]
		try:
			# Nothing selected: answer with a prompt instead of letting len()
			# fail on None.
			if file_bytes is None:
				return "Please upload a CSV or JSON chat file.", None

			# Manual file size check (10 MB max)
			if len(file_bytes) > 10_000_000:
				return "File too large. Max size is 10 MB.", None

			# Try JSON before CSV. The CSV reader accepts most JSON text as a
			# junk one-row table, so a CSV-first order would rarely reach the
			# JSON parser; the JSON reader rejects CSV text.
			df = None
			parsed_any = False
			for reader in (pd.read_json, pd.read_csv):
				try:
					candidate = reader(BytesIO(file_bytes))
				except Exception:
					# The readers raise several unrelated error types; any
					# failure just means "not this format", try the next one.
					continue
				parsed_any = True
				if all(col in candidate.columns for col in required):
					df = candidate
					break

			if df is None:
				if parsed_any:
					return "File must contain columns: sender, receiver, message", None
				return "Unsupported file format. Please upload CSV or JSON.", None

			chats = list(df[required].itertuples(index=False, name=None))
			user_risk, conv_risk, B = compute_URI(chats)
			top_words = get_top_words(user_risk, B)
			status = "Sex Chat Detected" if conv_risk >= 10 else "Not Detected"

			# Prepare metrics text: header lines, then one line per user.
			report_lines = [
				f"Conversation Risk Index: {conv_risk}",
				f"Status: {status}",
				"",
				"User Risk Index:",
			]
			report_lines.extend(
				f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}"
				for user, uri in user_risk.items()
			)
			metrics_text = "\n".join(report_lines) + "\n"

			fig = plot_graph(B, user_risk)
			return metrics_text, fig

		except Exception as e:
			# UI boundary: report the failure in the text box rather than
			# raising out of the event handler.
			return f"Error processing file: {e}", None

	# -------------------------------
	# Gradio App Launch
	# -------------------------------
	# NOTE(review): the nesting below was reconstructed from a flattened copy
	# of this file; confirm the column contents against the intended layout.
	with gr.Blocks(title="Sex Chat Detection") as iface:
		with gr.Row():
			# First column: the upload control with the text report below it.
			with gr.Column(scale=1):
				file_input = gr.File(label="Upload Chat CSV/JSON", file_types=['.csv', '.json'], type='binary')
				metrics_output = gr.Textbox(label="Metrics", lines=15)
			# Second column: the rendered graph.
			with gr.Column(scale=1):
				graph_output = gr.Plot(label="Risk Graph")

		# Connect inputs and outputs: re-run the analysis whenever the
		# uploaded file changes.
		file_input.change(fn=analyze_chat, inputs=file_input, outputs=[metrics_output, graph_output])

	# Start the server only when this file is executed as a script, so that
	# importing the module (e.g. to reuse analyze_chat) does not launch it.
	if __name__ == "__main__":
		iface.launch()