File size: 5,488 Bytes
d9afd4a
 
 
 
 
d629c53
 
d9afd4a
 
 
 
 
 
 
 
d629c53
d9afd4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d629c53
 
d9afd4a
d629c53
d9afd4a
 
 
 
 
 
 
 
d629c53
d9afd4a
 
 
d629c53
 
 
 
d9afd4a
 
d629c53
d9afd4a
7a7dd94
d9afd4a
d629c53
 
d9afd4a
 
df40255
7a7dd94
df40255
 
7a7dd94
 
 
 
 
 
 
 
d629c53
d9afd4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d629c53
d9afd4a
a124cb4
 
 
 
 
 
 
 
 
 
d9afd4a
 
a124cb4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Sex Chat Detection Web App using User-Word Bipartite Graph
Author: Ramaguru Radhakrishnan
Date: 16th October 2025

Hugging Face / Gradio deployment: process uploaded chat CSV/JSON in memory,
compute metrics, and visualize the user-word graph efficiently without hitting storage limits.
"""

import networkx as nx
import matplotlib.pyplot as plt
import re
import json
import gradio as gr
import pandas as pd
from io import BytesIO

# -------------------------------
# Load Lexicon from JSON
# -------------------------------
# data.json is expected to be a list of {"word": ..., "severity": ...}
# entries (severity values "low"/"medium"/"high" — others fall back to
# weight 1 at lookup time). Explicit encoding avoids platform-dependent
# decoding of the UTF-8 JSON file.
with open("data.json", encoding="utf-8") as f:
    lexicon_data = json.load(f)

# word -> severity label
LEXICON = {entry["word"]: entry["severity"] for entry in lexicon_data}
# severity label -> numeric edge weight
SEVERITY_WEIGHT = {"low": 1, "medium": 2, "high": 3}

# -------------------------------
# Helper Functions
# -------------------------------
def tokenize(text):
    """Tokenize input text into lowercase words."""
    return re.findall(r'\b[a-zA-Z]+\b', str(text).lower())

def compute_URI(chats):
    """Build a user-word bipartite graph from chats and derive risk scores.

    Each (sender, receiver, message) triple charges BOTH participants
    with every token of the message; repeated tokens accumulate weight
    on the corresponding user-word edge.

    Returns (user_risk, conv_risk, graph) where user_risk maps each
    user with at least one edge to a rounded User Risk Index, and
    conv_risk is the mean URI across scored users (0 if none).
    """
    graph = nx.Graph()
    participants = set()
    vocabulary = set()

    for sender, receiver, message in chats:
        tokens = tokenize(message)
        for person in (sender, receiver):
            participants.add(person)
            for token in tokens:
                vocabulary.add(token)
                # Unknown words count as "low" severity (weight 1).
                severity = LEXICON.get(token, "low")
                w = SEVERITY_WEIGHT.get(severity, 1)
                if graph.has_edge(person, token):
                    graph[person][token]["weight"] += w
                else:
                    graph.add_edge(person, token, weight=w)

    # URI = total incident edge weight, scaled by the fraction of the
    # vocabulary the user touched.
    vocab_size = max(1, len(vocabulary))
    user_risk = {}
    for person in participants:
        incident = list(graph.edges(person, data=True))
        if not incident:
            continue  # user never co-occurred with any token
        total_weight = sum(attrs["weight"] for _, _, attrs in incident)
        uri = total_weight * (len(incident) / vocab_size)
        user_risk[person] = round(uri, 2)

    # Conversation-level risk is the mean of the per-user scores.
    conv_risk = round(sum(user_risk.values()) / len(user_risk), 2) if user_risk else 0
    return user_risk, conv_risk, graph

def get_top_words(user_risk, B, top_n=5):
    """Return, per scored user, the top_n heaviest words as 'word(weight)' strings."""
    result = {}
    for user in user_risk:
        ranked = sorted(
            B.edges(user, data=True),
            key=lambda edge: edge[2]['weight'],
            reverse=True,
        )
        result[user] = [
            f"{word}({attrs['weight']})" for _, word, attrs in ranked[:top_n]
        ]
    return result

def plot_graph(B, user_risk):
    """Render the user-word bipartite graph as a matplotlib Figure.

    User nodes are drawn red with opacity scaled by their risk score
    (fully opaque from risk >= 40); word nodes are light blue; edge
    widths follow accumulated edge weight.

    The figure is removed from pyplot's global registry before
    returning so repeated requests in a long-running app do not leak
    figures — the Figure object itself stays renderable by Gradio.
    """
    # Create an explicit Axes and draw onto it, rather than relying on
    # pyplot's implicit "current figure" global state.
    fig, ax = plt.subplots(figsize=(10, 8))
    pos = nx.spring_layout(B, k=0.8)

    node_colors = []
    for n in B.nodes():
        if n in user_risk:
            # Risk-scaled red: alpha ramps from 0.2 up to 1.0.
            alpha = min(0.2 + user_risk[n] / 50, 1)
            node_colors.append((1, 0, 0, alpha))
        else:
            node_colors.append('lightblue')

    edge_widths = [B[u][v]['weight'] for u, v in B.edges()]
    nx.draw(B, pos, ax=ax, with_labels=True, node_color=node_colors,
            node_size=800, font_size=10, width=edge_widths)
    ax.set_title("User-Word Bipartite Graph with Risk Heatmap")

    plt.close(fig)  # drop from pyplot registry to release memory
    return fig

# -------------------------------
# Gradio Interface Function
# -------------------------------
def analyze_chat(file_bytes):
    """
    Analyze an uploaded chat file (CSV or JSON) with columns:
    sender, receiver, message.

    Processes entirely in memory (BytesIO, no temp files) to avoid
    hosting storage limits.

    Returns a (metrics_text, figure) tuple; figure is None on any error.
    """
    try:
        # Gradio passes None when the file input is cleared.
        if file_bytes is None:
            return "No file uploaded.", None

        # Manual size cap (10 MB) — the whole payload sits in memory.
        if len(file_bytes) > 10_000_000:
            return "File too large. Max size is 10 MB.", None

        # Try CSV first, fall back to JSON. Catch Exception (never a
        # bare except) so KeyboardInterrupt/SystemExit still propagate.
        try:
            df = pd.read_csv(BytesIO(file_bytes))
        except Exception:
            try:
                df = pd.read_json(BytesIO(file_bytes))
            except Exception:
                return "Unsupported file format. Please upload CSV or JSON.", None

        required = ("sender", "receiver", "message")
        if not all(col in df.columns for col in required):
            return "File must contain columns: sender, receiver, message", None

        chats = list(df[["sender", "receiver", "message"]].itertuples(index=False, name=None))
        user_risk, conv_risk, B = compute_URI(chats)
        top_words = get_top_words(user_risk, B)
        status = "Sex Chat Detected" if conv_risk >= 10 else "Not Detected"

        # Assemble the metrics report with join() instead of repeated
        # string concatenation.
        lines = [
            f"Conversation Risk Index: {conv_risk}",
            f"Status: {status}",
            "",
            "User Risk Index:",
        ]
        for user, uri in user_risk.items():
            lines.append(f"{user}: URI = {uri}, Top Words: {', '.join(top_words[user])}")
        metrics_text = "\n".join(lines) + "\n"

        fig = plot_graph(B, user_risk)
        return metrics_text, fig

    except Exception as e:
        # Last-resort guard: surface the error in the UI instead of crashing.
        return f"Error processing file: {e}", None

# -------------------------------
# Gradio App Launch
# -------------------------------
# Two-column layout: file upload + text metrics on the left, the risk
# graph on the right.
with gr.Blocks(title="Sex Chat Detection") as iface:
    with gr.Row():
        with gr.Column(scale=1):
            # type='binary' delivers raw bytes, matching analyze_chat's input.
            file_input = gr.File(label="Upload Chat CSV/JSON", file_types=['.csv', '.json'], type='binary')
            metrics_output = gr.Textbox(label="Metrics", lines=15)
        with gr.Column(scale=1):
            graph_output = gr.Plot(label="Risk Graph")

    # Connect inputs and outputs
    # Re-run the analysis whenever the uploaded file changes (including
    # when it is cleared, in which case the input is None).
    file_input.change(fn=analyze_chat, inputs=file_input, outputs=[metrics_output, graph_output])

iface.launch()