Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,117 +1,176 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
from sklearn.cluster import KMeans
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
-
import
|
|
|
|
| 7 |
import os
|
| 8 |
import io
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# === CONFIGURATION ===
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
# === STEP 1: CLUSTERING MODEL ===
|
| 15 |
-
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 16 |
-
|
| 17 |
-
def get_embeddings(texts):
    """Encode ``texts`` with the module-level ``embedding_model``.

    The progress bar is disabled for the call.

    Args:
        texts: The texts handed to ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``texts``.
    """
    vectors = embedding_model.encode(texts, show_progress_bar=False)
    return vectors
|
| 19 |
-
|
| 20 |
-
def cluster_texts(texts, n_clusters=10):
    """Assign each text to a K-Means cluster computed over its embedding.

    Args:
        texts: Sequence of strings to cluster.
        n_clusters: Requested number of clusters. Coerced to ``int`` and
            clamped to the range ``1..len(texts)``, because K-Means cannot
            fit more clusters than it has samples.

    Returns:
        Array of cluster ids, one per entry of ``texts`` and in the same
        order. Empty when ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed or fit; KMeans would raise on zero samples.
        return np.empty(0, dtype=int)

    n_clusters = max(1, min(int(n_clusters), len(texts)))
    embeddings = get_embeddings(texts)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(embeddings)
|
| 25 |
-
|
| 26 |
-
# === STEP 2: FALCON-BASED LABELING ===
|
| 27 |
-
def query_falcon(prompt):
    """Send ``prompt`` to the hosted Falcon model and return its generated text.

    The function never raises: every failure is reported as a string so the
    caller can place it directly where a label would go.

    Args:
        prompt: Full prompt text sent as the ``inputs`` field.

    Returns:
        The stripped ``generated_text`` of the first result, or
        ``"API Token missing"`` when ``HF_API_TOKEN`` is falsy, or a message
        starting with ``"Error calling Falcon: "`` on any failure.
    """
    if not HF_API_TOKEN:
        return "API Token missing"

    api_url = f"https://api-inference.huggingface.co/models/{FALCON_MODEL}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0.3,
            "do_sample": True,
        },
    }

    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.post(api_url, headers=headers, json=payload, timeout=60)
        if not response.ok:
            # Report the HTTP failure itself instead of the opaque error that
            # indexing an error payload with [0] would produce.
            return f"Error calling Falcon: HTTP {response.status_code}: {response.text[:200]}"
        return response.json()[0]['generated_text'].strip()
    except Exception as e:
        # Deliberate catch-all boundary: callers expect a string, not an exception.
        return f"Error calling Falcon: {str(e)}"
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
topic_labels = {}
|
| 56 |
-
|
| 57 |
-
for cid, sample_text in cluster_samples.items():
|
| 58 |
-
prompt = f"""
|
| 59 |
-
You are an expert in qualitative analysis.
|
| 60 |
-
Given the following customer feedback examples from one group, describe the overall theme in 1–2 words.
|
| 61 |
-
|
| 62 |
-
EXAMPLES:
|
| 63 |
-
{sample_text}
|
| 64 |
-
|
| 65 |
-
TOPIC LABEL:
|
| 66 |
-
"""
|
| 67 |
-
label = query_falcon(prompt)
|
| 68 |
-
topic_labels[cid] = label
|
| 69 |
|
| 70 |
-
|
|
|
|
| 71 |
|
| 72 |
-
# === STEP 3: REFINEMENT LOOP UTILS ===
|
| 73 |
session = {
|
| 74 |
"original_df": None,
|
| 75 |
"current_df": None,
|
| 76 |
"context": "",
|
| 77 |
-
"topic_labels": {}
|
|
|
|
|
|
|
| 78 |
}
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
|
|
|
| 88 |
def run_initial_analysis(csv_file, context_input, n_clusters=10):
|
| 89 |
try:
|
| 90 |
-
df =
|
| 91 |
except Exception as e:
|
| 92 |
return f"Error reading CSV: {str(e)}", "", ""
|
| 93 |
|
| 94 |
-
if 'text' not in df.columns:
|
| 95 |
-
return "CSV must contain a column named 'text'", "", ""
|
| 96 |
-
|
| 97 |
session['original_df'] = df.copy()
|
| 98 |
session['context'] = context_input
|
| 99 |
|
|
|
|
|
|
|
|
|
|
| 100 |
texts = df['text'].tolist()
|
| 101 |
clusters = cluster_texts(texts, n_clusters)
|
| 102 |
df['cluster'] = clusters
|
| 103 |
|
| 104 |
-
topic_labels =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
df['label'] = df['cluster'].map(topic_labels)
|
|
|
|
| 106 |
|
| 107 |
session['current_df'] = df
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
output = io.StringIO()
|
| 110 |
df.to_csv(output, index=False)
|
| 111 |
csv_str = output.getvalue()
|
| 112 |
|
| 113 |
return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
|
| 114 |
|
|
|
|
| 115 |
def refine_labels(feedback_input):
|
| 116 |
if session['current_df'] is None:
|
| 117 |
return "No data found. Please run initial analysis first.", "", ""
|
|
@@ -135,7 +194,8 @@ Instructions:
|
|
| 135 |
Return only the revised labels, one per line.
|
| 136 |
"""
|
| 137 |
|
| 138 |
-
|
|
|
|
| 139 |
new_labels = response.strip().split('\n')[:len(df)]
|
| 140 |
|
| 141 |
df['label'] = new_labels[:len(df)]
|
|
@@ -147,28 +207,10 @@ Return only the revised labels, one per line.
|
|
| 147 |
|
| 148 |
return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
|
| 149 |
|
| 150 |
-
import gradio as gr
|
| 151 |
-
import pandas as pd
|
| 152 |
-
|
| 153 |
-
# === Placeholder Functions (replace with actual logic) ===
|
| 154 |
-
def run_initial_analysis(file, context, n_clusters):
    """Placeholder analysis: load the uploaded CSV and report what was read.

    Args:
        file: Uploaded file, either an object exposing the path as ``.name``
            or the path itself. ``None`` means nothing was uploaded.
        context: Free-text context from the UI (unused by this placeholder).
        n_clusters: Requested number of topics; only echoed in the status.

    Returns:
        ``(status, file, preview)`` where ``preview`` is the first ten rows
        as CSV text, or ``(error message, None, "")`` on failure.
    """
    if file is None:
        # Give a readable message instead of an AttributeError on ``None.name``.
        return "Error during initial analysis: no file was uploaded", None, ""

    try:
        # Accept both an upload object exposing ``.name`` and a plain path.
        path = getattr(file, "name", file)
        df = pd.read_csv(path, encoding='latin1')
        preview = df.head(10).to_csv(index=False)
        # Dummy status and download file path (replace with real logic)
        status = f"Successfully processed {len(df)} rows into {n_clusters} topics."
        return status, file, preview
    except Exception as e:
        return f"Error during initial analysis: {e}", None, ""
|
| 163 |
-
|
| 164 |
-
def refine_labels(feedback):
    """Placeholder refinement: echo the feedback in the status message.

    Args:
        feedback: Feedback text from the UI.

    Returns:
        ``(status, None, "")`` -- no download and no preview are produced.
    """
    # Dummy response (replace with real logic)
    status = f"Refined using feedback: {feedback}"
    return status, None, ""
|
| 167 |
-
|
| 168 |
# === GRADIO UI ===
|
| 169 |
-
with gr.Blocks(title="
|
| 170 |
-
gr.Markdown("# 🎯
|
| 171 |
-
gr.Markdown("Upload verbatims, get topics
|
| 172 |
|
| 173 |
with gr.Row():
|
| 174 |
with gr.Column():
|
|
@@ -189,4 +231,4 @@ with gr.Blocks(title="Falcon Topic Modeling") as demo:
|
|
| 189 |
refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from sklearn.cluster import KMeans
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from keybert import KeyBERT
|
| 6 |
+
import numpy as np
|
| 7 |
import os
|
| 8 |
import io
|
| 9 |
+
from crewai import Agent, Task, Crew
|
| 10 |
+
from langchain_community.llms import HuggingFaceHub
|
| 11 |
|
| 12 |
# === CONFIGURATION ===
|
| 13 |
+
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN") # Set this in environment
|
| 14 |
+
MODEL_NAME = "meta-llama/Llama-3-8b-chat-hf" # Also supports Mistral, Qwen, etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# Setup LLM via HuggingFace Hub
|
| 17 |
+
llm = HuggingFaceHub(
|
| 18 |
+
repo_id=MODEL_NAME,
|
| 19 |
+
huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
|
| 20 |
+
model_kwargs={"temperature": 0.4, "max_new_tokens": 64}
|
| 21 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 24 |
+
keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")
|
| 25 |
|
|
|
|
| 26 |
session = {
|
| 27 |
"original_df": None,
|
| 28 |
"current_df": None,
|
| 29 |
"context": "",
|
| 30 |
+
"topic_labels": {},
|
| 31 |
+
"keywords": {},
|
| 32 |
+
"clusters_verified": False
|
| 33 |
}
|
| 34 |
|
| 35 |
+
# === AGENTS ===
|
| 36 |
+
keyword_agent = Agent(
|
| 37 |
+
role='Keyword Analyst',
|
| 38 |
+
goal='Extract top 5 keywords from a group of similar texts',
|
| 39 |
+
backstory="""You are a skilled keyword analyst who identifies patterns in text data.
|
| 40 |
+
You focus on extracting concise, meaningful keywords that represent the core themes.""",
|
| 41 |
+
llm=llm,
|
| 42 |
+
verbose=False
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
labeling_agent = Agent(
|
| 46 |
+
role='Topic Labeler',
|
| 47 |
+
goal='Generate a short label for a group of similar texts based on context',
|
| 48 |
+
backstory="""You are a professional theme summarizer. Given example texts and a user context,
|
| 49 |
+
you generate clear and actionable topic labels.""",
|
| 50 |
+
llm=llm,
|
| 51 |
+
verbose=False
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
validation_agent = Agent(
|
| 55 |
+
role='QA Analyst',
|
| 56 |
+
goal='Evaluate whether the clustered topics and keywords form coherent themes',
|
| 57 |
+
backstory="""You are a quality assurance expert evaluating if generated topics make sense.
|
| 58 |
+
You return 'Approved' or 'Needs Refinement' based on coherence.""",
|
| 59 |
+
llm=llm,
|
| 60 |
+
verbose=False
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
finalizer_agent = Agent(
|
| 64 |
+
role='Data Engineer',
|
| 65 |
+
goal='Prepare final labeled dataset for download',
|
| 66 |
+
backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
|
| 67 |
+
llm=llm,
|
| 68 |
+
verbose=False
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# === TASKS ===
|
| 72 |
+
def create_tasks(text_samples, context_input):
    """Build the four CrewAI tasks for one group of sample texts.

    Args:
        text_samples: Sample texts, already joined into one string, that are
            embedded in the keyword and labeling task descriptions.
        context_input: User instruction quoted inside the labeling task.

    Returns:
        Tuple ``(extract_keywords, label_topic, validate_cluster,
        finalize_data)`` of ``Task`` objects, bound to ``keyword_agent``,
        ``labeling_agent``, ``validation_agent`` and ``finalizer_agent``.
    """
    keywords_description = (
        f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}"
    )
    labeling_description = (
        f"Based on the following examples and instruction: '{context_input}', "
        f"generate a concise topic label.\n\n{text_samples}"
    )
    # ``{label}`` and ``{keywords}`` stay as literal placeholders in this
    # description; nothing in this function fills them in.
    validation_description = (
        "Evaluate whether the topic label and keywords make sense together."
        "\n\nLABEL: {label}\nKEYWORDS: {keywords}"
    )

    keywords_task = Task(
        description=keywords_description,
        agent=keyword_agent,
        expected_output="Comma-separated list of keywords",
    )
    labeling_task = Task(
        description=labeling_description,
        agent=labeling_agent,
        expected_output="A single line topic label",
    )
    validation_task = Task(
        description=validation_description,
        agent=validation_agent,
        expected_output="'Approved' or 'Needs Refinement'",
    )
    finalize_task = Task(
        description="Take the approved labeled DataFrame and format it for download.",
        agent=finalizer_agent,
        expected_output="Final CSV content as string",
    )

    return keywords_task, labeling_task, validation_task, finalize_task
|
| 98 |
+
|
| 99 |
+
# === CLUSTERING ===
|
| 100 |
+
def cluster_texts(texts, n_clusters=10):
    """Assign each text to a K-Means cluster computed over its embedding.

    Args:
        texts: Sequence of strings to cluster.
        n_clusters: Requested number of clusters. Coerced to ``int`` and
            clamped to the range ``1..len(texts)``, because K-Means cannot
            fit more clusters than it has samples.

    Returns:
        Array of cluster ids, one per entry of ``texts`` and in the same
        order. Empty when ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed or fit; KMeans would raise on zero samples.
        return np.empty(0, dtype=int)

    n_clusters = max(1, min(int(n_clusters), len(texts)))
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(embeddings)
|
| 104 |
|
| 105 |
+
# === FULL PIPELINE FUNCTION ===
|
| 106 |
def run_initial_analysis(csv_file, context_input, n_clusters=10):
    """Cluster the uploaded verbatims and label every cluster with the agents.

    Pipeline: read the CSV, cluster the ``text`` column, run the keyword and
    labeling crews on up to three sample texts per cluster, write ``cluster``,
    ``label`` and ``keywords`` columns, then run the validation crew and
    record the verdict in ``session["clusters_verified"]``.

    Args:
        csv_file: Uploaded CSV, either an object exposing the path as
            ``.name`` or the path itself.
        context_input: User instruction passed to the labeling task and
            stored in ``session['context']``.
        n_clusters: Requested number of clusters. Coerced to ``int`` (UI
            widgets may deliver it as a float -- TODO confirm) and clamped
            to the number of usable rows.

    Returns:
        ``(status, csv_text, markdown_preview)``. On a rejected upload the
        status holds the reason, the other two items are empty strings, and
        the shared ``session`` is left untouched.
    """
    try:
        df = pd.read_csv(getattr(csv_file, "name", csv_file))
    except Exception as e:
        return f"Error reading CSV: {str(e)}", "", ""

    # Validate before touching the shared session, so a rejected upload does
    # not overwrite the state of the previous successful run.
    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", "", ""

    original_df = df.copy()

    # Rows without text cannot be embedded or joined into a sample prompt.
    df = df.dropna(subset=['text']).reset_index(drop=True)
    df['text'] = df['text'].astype(str)
    if df.empty:
        return "CSV contains no rows with a value in the 'text' column", "", ""

    session['original_df'] = original_df
    session['context'] = context_input

    texts = df['text'].tolist()
    # K-Means cannot fit more clusters than there are rows.
    n_clusters = max(1, min(int(n_clusters), len(texts)))
    df['cluster'] = cluster_texts(texts, n_clusters)

    topic_labels = {}
    keywords_map = {}

    # groupby yields only non-empty clusters, in ascending cluster-id order.
    for cluster_id, cluster_rows in df.groupby('cluster')['text']:
        cid = int(cluster_id)
        samples = "\n".join(cluster_rows.head(3).tolist())

        # Create CrewAI tasks for this cluster; validation runs in a second
        # pass once every label and keyword list is known.
        ext_task, lbl_task, _, _ = create_tasks(samples, context_input)

        # Run keyword extraction
        keyword_result = Crew(agents=[keyword_agent], tasks=[ext_task]).kickoff()
        keywords_map[cid] = keyword_result.raw.strip()

        # Run labeling
        label_result = Crew(agents=[labeling_agent], tasks=[lbl_task]).kickoff()
        topic_labels[cid] = label_result.raw.strip()

    # Assign labels and keywords back to DataFrame
    df['label'] = df['cluster'].map(topic_labels)
    df['keywords'] = df['cluster'].map(keywords_map)

    session['current_df'] = df

    # Validate clusters: stop at the first one the QA agent rejects.
    verified = True
    for cid, label in topic_labels.items():
        val_task = Task(
            description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {label}\nKEYWORDS: {keywords_map.get(cid, '')}",
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'"
        )
        res = Crew(agents=[validation_agent], tasks=[val_task]).kickoff()
        if "Needs" in res.raw:
            verified = False
            break
    session["clusters_verified"] = verified

    output = io.StringIO()
    df.to_csv(output, index=False)
    csv_str = output.getvalue()

    return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
|
| 172 |
|
| 173 |
+
# === REFINEMENT FUNCTION ===
|
| 174 |
def refine_labels(feedback_input):
|
| 175 |
if session['current_df'] is None:
|
| 176 |
return "No data found. Please run initial analysis first.", "", ""
|
|
|
|
| 194 |
Return only the revised labels, one per line.
|
| 195 |
"""
|
| 196 |
|
| 197 |
+
# Simulating refinement using the same LLM
|
| 198 |
+
response = llm(prompt)
|
| 199 |
new_labels = response.strip().split('\n')[:len(df)]
|
| 200 |
|
| 201 |
df['label'] = new_labels[:len(df)]
|
|
|
|
| 207 |
|
| 208 |
return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# === GRADIO UI ===
|
| 211 |
+
with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
|
| 212 |
+
gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
|
| 213 |
+
gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")
|
| 214 |
|
| 215 |
with gr.Row():
|
| 216 |
with gr.Column():
|
|
|
|
| 231 |
refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
|
| 232 |
|
| 233 |
if __name__ == "__main__":
|
| 234 |
+
demo.launch()
|