Spaces:

Alimubariz124
/

Topic_modelling

Runtime error

App Files Community

Alimubariz124 committed on Apr 30, 2025

Commit

eda8ec5

verified ·

1 Parent(s): d85637d

Update app.py

Browse files

Files changed (1) hide show

app.py +139 -97

app.py CHANGED Viewed

@@ -1,117 +1,176 @@
 import gradio as gr
 import pandas as pd
-import numpy as np
 from sklearn.cluster import KMeans
 from sentence_transformers import SentenceTransformer
-import requests
 import os
 import io
 # === CONFIGURATION ===
-HF_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set in Hugging Face Secrets
-FALCON_MODEL = "tiiuae/falcon-7b-instruct"
-# === STEP 1: CLUSTERING MODEL ===
-embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-def get_embeddings(texts):
-    return embedding_model.encode(texts, show_progress_bar=False)
-def cluster_texts(texts, n_clusters=10):
-    embeddings = get_embeddings(texts)
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    clusters = kmeans.fit_predict(embeddings)
-    return clusters
-# === STEP 2: FALCON-BASED LABELING ===
-def query_falcon(prompt):
-    if not HF_API_TOKEN:
-        return "API Token missing"
-    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
-    API_URL = f"https://api-inference.huggingface.co/models/{FALCON_MODEL}"
-    payload = {
-        "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": 50,
-            "temperature": 0.3,
-            "do_sample": True
-        }
-    }
-    try:
-        response = requests.post(API_URL, headers=headers, json=payload)
-        return response.json()[0]['generated_text'].strip()
-    except Exception as e:
-        return f"Error calling Falcon: {str(e)}"
-def generate_topic_labels(texts, clusters, n_clusters=10):
-    cluster_samples = {}
-    for i in range(n_clusters):
-        samples = [texts[j] for j in range(len(clusters)) if clusters[j] == i][:3]
-        cluster_samples[i] = "\n".join(samples)
-    topic_labels = {}
-    for cid, sample_text in cluster_samples.items():
-        prompt = f"""
-You are an expert in qualitative analysis.
-Given the following customer feedback examples from one group, describe the overall theme in 1–2 words.
-EXAMPLES:
-{sample_text}
-TOPIC LABEL:
-        """
-        label = query_falcon(prompt)
-        topic_labels[cid] = label
-    return topic_labels
-# === STEP 3: REFINEMENT LOOP UTILS ===
 session = {
     "original_df": None,
     "current_df": None,
     "context": "",
-    "topic_labels": {}
 }
-def read_csv_file(file_obj):
-    """Robust CSV reader that handles both string paths and file-like objects"""
-    if isinstance(file_obj, str):
-        return pd.read_csv(file_obj)
-    else:
-        content = file_obj.read().decode("utf-8")
-        return pd.read_csv(io.StringIO(content))
 def run_initial_analysis(csv_file, context_input, n_clusters=10):
     try:
-        df = read_csv_file(csv_file)
     except Exception as e:
         return f"Error reading CSV: {str(e)}", "", ""
-    if 'text' not in df.columns:
-        return "CSV must contain a column named 'text'", "", ""
     session['original_df'] = df.copy()
     session['context'] = context_input
     texts = df['text'].tolist()
     clusters = cluster_texts(texts, n_clusters)
     df['cluster'] = clusters
-    topic_labels = generate_topic_labels(texts, clusters, n_clusters)
     df['label'] = df['cluster'].map(topic_labels)
     session['current_df'] = df
     output = io.StringIO()
     df.to_csv(output, index=False)
     csv_str = output.getvalue()
     return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
 def refine_labels(feedback_input):
     if session['current_df'] is None:
         return "No data found. Please run initial analysis first.", "", ""
@@ -135,7 +194,8 @@ Instructions:
 Return only the revised labels, one per line.
 """
-    response = query_falcon(prompt)
     new_labels = response.strip().split('\n')[:len(df)]
     df['label'] = new_labels[:len(df)]
@@ -147,28 +207,10 @@ Return only the revised labels, one per line.
     return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
-import gradio as gr
-import pandas as pd
-# === Placeholder Functions (replace with actual logic) ===
-def run_initial_analysis(file, context, n_clusters):
-    try:
-        df = pd.read_csv(file.name, encoding='latin1')
-        preview = df.head(10).to_csv(index=False)
-        # Dummy status and download file path (replace with real logic)
-        status = f"Successfully processed {len(df)} rows into {n_clusters} topics."
-        return status, file, preview
-    except Exception as e:
-        return f"Error during initial analysis: {e}", None, ""
-def refine_labels(feedback):
-    # Dummy response (replace with real logic)
-    return f"Refined using feedback: {feedback}", None, ""
 # === GRADIO UI ===
-with gr.Blocks(title="Falcon Topic Modeling") as demo:
-    gr.Markdown("# 🎯 Falcon-Powered Topic Modeling")
-    gr.Markdown("Upload verbatims, get topics, and refine iteratively.")
     with gr.Row():
         with gr.Column():
@@ -189,4 +231,4 @@ with gr.Blocks(title="Falcon Topic Modeling") as demo:
     refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import pandas as pd
 from sklearn.cluster import KMeans
 from sentence_transformers import SentenceTransformer
+from keybert import KeyBERT
+import numpy as np
 import os
 import io
+from crewai import Agent, Task, Crew
+from langchain_community.llms import HuggingFaceHub
 # === CONFIGURATION ===
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set this in environment
+MODEL_NAME = "meta-llama/Llama-3-8b-chat-hf"  # Also supports Mistral, Qwen, etc.
+# Setup LLM via HuggingFace Hub
+llm = HuggingFaceHub(
+    repo_id=MODEL_NAME,
+    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
+    model_kwargs={"temperature": 0.4, "max_new_tokens": 64}
+)
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")
 session = {
     "original_df": None,
     "current_df": None,
     "context": "",
+    "topic_labels": {},
+    "keywords": {},
+    "clusters_verified": False
 }
+# === AGENTS ===
+keyword_agent = Agent(
+    role='Keyword Analyst',
+    goal='Extract top 5 keywords from a group of similar texts',
+    backstory="""You are a skilled keyword analyst who identifies patterns in text data.
+    You focus on extracting concise, meaningful keywords that represent the core themes.""",
+    llm=llm,
+    verbose=False
+)
+labeling_agent = Agent(
+    role='Topic Labeler',
+    goal='Generate a short label for a group of similar texts based on context',
+    backstory="""You are a professional theme summarizer. Given example texts and a user context,
+    you generate clear and actionable topic labels.""",
+    llm=llm,
+    verbose=False
+)
+validation_agent = Agent(
+    role='QA Analyst',
+    goal='Evaluate whether the clustered topics and keywords form coherent themes',
+    backstory="""You are a quality assurance expert evaluating if generated topics make sense.
+    You return 'Approved' or 'Needs Refinement' based on coherence.""",
+    llm=llm,
+    verbose=False
+)
+finalizer_agent = Agent(
+    role='Data Engineer',
+    goal='Prepare final labeled dataset for download',
+    backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
+    llm=llm,
+    verbose=False
+)
+# === TASKS ===
+def create_tasks(text_samples, context_input):
+    extract_keywords_task = Task(
+        description=f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}",
+        agent=keyword_agent,
+        expected_output="Comma-separated list of keywords"
+    )
+    label_topic_task = Task(
+        description=f"Based on the following examples and instruction: '{context_input}', generate a concise topic label.\n\n{text_samples}",
+        agent=labeling_agent,
+        expected_output="A single line topic label"
+    )
+    validate_cluster_task = Task(
+        description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {{label}}\nKEYWORDS: {{keywords}}",
+        agent=validation_agent,
+        expected_output="'Approved' or 'Needs Refinement'"
+    )
+    finalize_data_task = Task(
+        description="Take the approved labeled DataFrame and format it for download.",
+        agent=finalizer_agent,
+        expected_output="Final CSV content as string"
+    )
+    return extract_keywords_task, label_topic_task, validate_cluster_task, finalize_data_task
+# === CLUSTERING ===
+def cluster_texts(texts, n_clusters=10):
+    embeddings = embedding_model.encode(texts, show_progress_bar=False)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    return kmeans.fit_predict(embeddings)
+# === FULL PIPELINE FUNCTION ===
 def run_initial_analysis(csv_file, context_input, n_clusters=10):
     try:
+        df = pd.read_csv(csv_file.name)
     except Exception as e:
         return f"Error reading CSV: {str(e)}", "", ""
     session['original_df'] = df.copy()
     session['context'] = context_input
+    if 'text' not in df.columns:
+        return "CSV must contain a column named 'text'", "", ""
     texts = df['text'].tolist()
     clusters = cluster_texts(texts, n_clusters)
     df['cluster'] = clusters
+    topic_labels = {}
+    keywords_map = {}
+    for i in range(n_clusters):
+        cluster_texts_i = [texts[j] for j in range(len(clusters)) if clusters[j] == i]
+        if not cluster_texts_i:
+            continue
+        samples = "\n".join(cluster_texts_i[:3])
+        # Create CrewAI Tasks for this cluster
+        ext_task, lbl_task, val_task, _ = create_tasks(samples, context_input)
+        # Run keyword extraction
+        crew_keyword = Crew(agents=[keyword_agent], tasks=[ext_task])
+        keyword_result = crew_keyword.kickoff()
+        keywords_map[i] = keyword_result.raw.strip()
+        # Run labeling
+        crew_label = Crew(agents=[labeling_agent], tasks=[lbl_task])
+        label_result = crew_label.kickoff()
+        topic_labels[i] = label_result.raw.strip()
+    # Assign labels and keywords back to DataFrame
     df['label'] = df['cluster'].map(topic_labels)
+    df['keywords'] = df['cluster'].map(keywords_map)
     session['current_df'] = df
+    # Validate Clusters
+    validation_prompts = []
+    for cid in topic_labels:
+        val_task = Task(
+            description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {topic_labels[cid]}\nKEYWORDS: {keywords_map.get(cid, '')}",
+            agent=validation_agent,
+            expected_output="'Approved' or 'Needs Refinement'"
+        )
+        crew_validate = Crew(agents=[validation_agent], tasks=[val_task])
+        res = crew_validate.kickoff()
+        if "Needs" in res.raw:
+            session["clusters_verified"] = False
+            break
+    else:
+        session["clusters_verified"] = True
     output = io.StringIO()
     df.to_csv(output, index=False)
     csv_str = output.getvalue()
     return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
+# === REFINEMENT FUNCTION ===
 def refine_labels(feedback_input):
     if session['current_df'] is None:
         return "No data found. Please run initial analysis first.", "", ""
 Return only the revised labels, one per line.
 """
+    # Simulating refinement using the same LLM
+    response = llm(prompt)
     new_labels = response.strip().split('\n')[:len(df)]
     df['label'] = new_labels[:len(df)]
     return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
 # === GRADIO UI ===
+with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
+    gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
+    gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")
     with gr.Row():
         with gr.Column():
     refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
 if __name__ == "__main__":
+    demo.launch()