Spaces:

Alimubariz124
/

Topic_modelling

Runtime error

App Files Files Community

Alimubariz124 committed on Apr 30, 2025

Commit

c2fded3

verified ·

1 Parent(s): 9314665

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import io
+import os
+import tempfile
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+import requests
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import KMeans
+# === CONFIGURATION ===
+# Hugging Face API token, read from the environment so it is never committed
+# to the repository. os.getenv returns None when HF_API_TOKEN is unset.
+HF_API_TOKEN = os.getenv("HF_API_TOKEN")
+# Model id that query_falcon interpolates into the Inference API URL.
+FALCON_MODEL = "tiiuae/falcon-7b-instruct"
+# === STEP 1: CLUSTERING MODEL ===
+# Sentence-embedding model, instantiated once at import time and reused by
+# get_embeddings for every request.
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+def get_embeddings(texts):
+    return embedding_model.encode(texts, show_progress_bar=False)
+def cluster_texts(texts, n_clusters=10):
+    embeddings = get_embeddings(texts)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    clusters = kmeans.fit_predict(embeddings)
+    return clusters
+# === STEP 2: FALCON-BASED LABELING ===
+def query_falcon(prompt):
+    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
+    API_URL = f"https://api-inference.huggingface.co/models/{FALCON_MODEL}"
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": 50,
+            "temperature": 0.3,
+            "do_sample": True
+        }
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    try:
+        return response.json()[0]['generated_text'].strip()
+    except Exception as e:
+        print(f"Error calling Falcon: {e}")
+        return ""
+def generate_topic_labels(texts, clusters, n_clusters=10):
+    cluster_samples = {}
+    for i in range(n_clusters):
+        samples = [texts[j] for j in range(len(clusters)) if clusters[j] == i][:3]
+        cluster_samples[i] = "\n".join(samples)
+    topic_labels = {}
+    for cid, sample_text in cluster_samples.items():
+        prompt = f"""
+You are an expert in qualitative analysis.
+Given the following customer feedback examples from one group, describe the overall theme in 1–2 words.
+EXAMPLES:
+{sample_text}
+TOPIC LABEL:
+        """
+        label = query_falcon(prompt)
+        topic_labels[cid] = label
+    return topic_labels
+# === STEP 3: REFINEMENT LOOP UTILS ===
+# Module-level state shared by the Gradio callbacks below.
+session = {
+    # Copy of the uploaded DataFrame, stored by run_initial_analysis.
+    "original_df": None,
+    # Working DataFrame with 'cluster' and 'label' columns; refine_labels
+    # reads and updates it.
+    "current_df": None,
+    # Context/instruction text supplied with the initial analysis.
+    "context": "",
+    # Mapping of cluster id -> label from the most recent initial analysis.
+    "topic_labels": {}
+}
+def run_initial_analysis(csv_file, context_input, n_clusters=10):
+    try:
+        df = pd.read_csv(csv_file)
+    except Exception as e:
+        return f"Error reading CSV: {str(e)}", "", ""
+    if 'text' not in df.columns:
+        return "CSV must contain a column named 'text'", "", ""
+    session['original_df'] = df.copy()
+    session['context'] = context_input
+    texts = df['text'].tolist()
+    clusters = cluster_texts(texts, n_clusters)
+    df['cluster'] = clusters
+    topic_labels = generate_topic_labels(texts, clusters, n_clusters)
+    df['label'] = df['cluster'].map(topic_labels)
+    session['current_df'] = df
+    session['topic_labels'] = topic_labels
+    # Save CSV
+    output = io.StringIO()
+    df.to_csv(output, index=False)
+    csv_str = output.getvalue()
+    return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
+def refine_labels(feedback_input):
+    if session['current_df'] is None:
+        return "No data found. Please run initial analysis first.", "", ""
+    df = session['current_df']
+    current_sample = df[['text', 'label']].head(10).to_markdown(index=False)
+    prompt = f"""
+You are helping refine topic labels based on user feedback.
+Current Labels:
+{current_sample}
+User Feedback:
+{feedback_input}
+Task:
+Reassign labels accordingly. Keep output format consistent: one label per line.
+Instructions:
+Return only the revised labels, one per line.
+"""
+    response = query_falcon(prompt)
+    new_labels = response.strip().split('\n')[:len(df)]
+    df['label'] = new_labels[:len(df)]
+    session['current_df'] = df
+    output = io.StringIO()
+    df.to_csv(output, index=False)
+    csv_str = output.getvalue()
+    return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
+# === GRADIO UI ===
+with gr.Blocks(title="Falcon Topic Modeling") as demo:
+    gr.Markdown("# 🎯 Falcon-Powered Topic Modeling")
+    gr.Markdown("Upload verbatims, get topics, and refine iteratively.")
+    with gr.Row():
+        with gr.Column():
+            upload = gr.File(label="Upload CSV ('text' column)")
+            context = gr.Textbox(label="Context/Instruction", lines=5, value="Group these into common themes.")
+            cluster_slider = gr.Slider(2, 20, value=10, label="Number of Topics")
+            run_btn = gr.Button("Run Initial Analysis")
+        with gr.Column():
+            feedback = gr.Textbox(label="Feedback / Instructions for Refinement", lines=5)
+            refine_btn = gr.Button("Refine Labels")
+    status = gr.Textbox(label="Status")
+    preview = gr.Textbox(label="First 10 Rows (Editable View)", lines=10)
+    download = gr.File(label="Download Final Labeled CSV")
+    run_btn.click(fn=run_initial_analysis, inputs=[upload, context, cluster_slider], outputs=[status, download, preview])
+    refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
+if __name__ == "__main__":
+    demo.launch()