Alimubariz124 committed on
Commit
eda8ec5
·
verified ·
1 Parent(s): d85637d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -97
app.py CHANGED
@@ -1,117 +1,176 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import numpy as np
4
  from sklearn.cluster import KMeans
5
  from sentence_transformers import SentenceTransformer
6
- import requests
 
7
  import os
8
  import io
 
 
9
 
10
  # === CONFIGURATION ===
11
- HF_API_TOKEN = os.getenv("HF_API_TOKEN") # Set in Hugging Face Secrets
12
- FALCON_MODEL = "tiiuae/falcon-7b-instruct"
13
-
14
- # === STEP 1: CLUSTERING MODEL ===
15
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
16
-
17
def get_embeddings(texts):
    """Return sentence embeddings for *texts* via the module-level encoder."""
    # Progress bar stays off: this runs inside a web-request handler.
    vectors = embedding_model.encode(texts, show_progress_bar=False)
    return vectors
19
-
20
def cluster_texts(texts, n_clusters=10):
    """Cluster *texts* into at most *n_clusters* groups using KMeans.

    Args:
        texts: list of strings to embed and cluster.
        n_clusters: desired cluster count; clamped to len(texts) because
            KMeans raises ValueError when asked for more clusters than samples.

    Returns:
        Array of integer cluster ids, one per input text.
    """
    embeddings = get_embeddings(texts)
    # Defensive clamp: small uploads (< n_clusters rows) must not crash.
    k = max(1, min(n_clusters, len(texts)))
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    return clusters
25
-
26
- # === STEP 2: FALCON-BASED LABELING ===
27
def query_falcon(prompt):
    """Query the Falcon instruct model through the HF Inference API.

    Args:
        prompt: full text prompt to send to the model.

    Returns:
        The generated text, or a human-readable error string when the token
        is missing or the HTTP call fails (callers display it directly, so
        this function never raises).
    """
    if not HF_API_TOKEN:
        return "API Token missing"

    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    API_URL = f"https://api-inference.huggingface.co/models/{FALCON_MODEL}"

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0.3,
            "do_sample": True
        }
    }

    try:
        # Timeout keeps a hung inference endpoint from blocking the UI forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        data = response.json()
        # The API returns a dict ({"error": ...}) on failure, a list on success;
        # indexing [0] on the dict would raise KeyError with a confusing message.
        if isinstance(data, dict):
            return f"Error calling Falcon: {data.get('error', 'unexpected response')}"
        return data[0]['generated_text'].strip()
    except Exception as e:
        return f"Error calling Falcon: {str(e)}"
48
 
49
def generate_topic_labels(texts, clusters, n_clusters=10):
    """Ask Falcon for a 1-2 word theme label per cluster.

    Uses up to three sample texts from each cluster as labeling context.
    Returns a dict mapping cluster id -> label string.
    """
    topic_labels = {}
    for cid in range(n_clusters):
        # Gather up to three member texts for this cluster id.
        members = [t for t, c in zip(texts, clusters) if c == cid][:3]
        sample_text = "\n".join(members)

        prompt = f"""
You are an expert in qualitative analysis.
Given the following customer feedback examples from one group, describe the overall theme in 1–2 words.

EXAMPLES:
{sample_text}

TOPIC LABEL:
"""
        topic_labels[cid] = query_falcon(prompt)

    return topic_labels
 
71
 
72
- # === STEP 3: REFINEMENT LOOP UTILS ===
73
  session = {
74
  "original_df": None,
75
  "current_df": None,
76
  "context": "",
77
- "topic_labels": {}
 
 
78
  }
79
 
80
def read_csv_file(file_obj):
    """Robust CSV reader that handles both string paths and file-like objects.

    Args:
        file_obj: a filesystem path (str), or a file-like object whose
            .read() returns either bytes or str.

    Returns:
        pandas.DataFrame parsed from the CSV content.
    """
    if isinstance(file_obj, str):
        return pd.read_csv(file_obj)
    content = file_obj.read()
    # Upload wrappers may hand back bytes (binary mode) or already-decoded
    # text (text mode); calling .decode() unconditionally crashed on str.
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return pd.read_csv(io.StringIO(content))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
 
88
  def run_initial_analysis(csv_file, context_input, n_clusters=10):
89
  try:
90
- df = read_csv_file(csv_file)
91
  except Exception as e:
92
  return f"Error reading CSV: {str(e)}", "", ""
93
 
94
- if 'text' not in df.columns:
95
- return "CSV must contain a column named 'text'", "", ""
96
-
97
  session['original_df'] = df.copy()
98
  session['context'] = context_input
99
 
 
 
 
100
  texts = df['text'].tolist()
101
  clusters = cluster_texts(texts, n_clusters)
102
  df['cluster'] = clusters
103
 
104
- topic_labels = generate_topic_labels(texts, clusters, n_clusters)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  df['label'] = df['cluster'].map(topic_labels)
 
106
 
107
  session['current_df'] = df
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  output = io.StringIO()
110
  df.to_csv(output, index=False)
111
  csv_str = output.getvalue()
112
 
113
  return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
114
 
 
115
  def refine_labels(feedback_input):
116
  if session['current_df'] is None:
117
  return "No data found. Please run initial analysis first.", "", ""
@@ -135,7 +194,8 @@ Instructions:
135
  Return only the revised labels, one per line.
136
  """
137
 
138
- response = query_falcon(prompt)
 
139
  new_labels = response.strip().split('\n')[:len(df)]
140
 
141
  df['label'] = new_labels[:len(df)]
@@ -147,28 +207,10 @@ Return only the revised labels, one per line.
147
 
148
  return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
149
 
150
- import gradio as gr
151
- import pandas as pd
152
-
153
- # === Placeholder Functions (replace with actual logic) ===
154
- def run_initial_analysis(file, context, n_clusters):
155
- try:
156
- df = pd.read_csv(file.name, encoding='latin1')
157
- preview = df.head(10).to_csv(index=False)
158
- # Dummy status and download file path (replace with real logic)
159
- status = f"Successfully processed {len(df)} rows into {n_clusters} topics."
160
- return status, file, preview
161
- except Exception as e:
162
- return f"Error during initial analysis: {e}", None, ""
163
-
164
- def refine_labels(feedback):
165
- # Dummy response (replace with real logic)
166
- return f"Refined using feedback: {feedback}", None, ""
167
-
168
  # === GRADIO UI ===
169
- with gr.Blocks(title="Falcon Topic Modeling") as demo:
170
- gr.Markdown("# 🎯 Falcon-Powered Topic Modeling")
171
- gr.Markdown("Upload verbatims, get topics, and refine iteratively.")
172
 
173
  with gr.Row():
174
  with gr.Column():
@@ -189,4 +231,4 @@ with gr.Blocks(title="Falcon Topic Modeling") as demo:
189
  refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
190
 
191
  if __name__ == "__main__":
192
- demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
 
3
  from sklearn.cluster import KMeans
4
  from sentence_transformers import SentenceTransformer
5
+ from keybert import KeyBERT
6
+ import numpy as np
7
  import os
8
  import io
9
+ from crewai import Agent, Task, Crew
10
+ from langchain_community.llms import HuggingFaceHub
11
 
12
# === CONFIGURATION ===
# Token is read from the environment; None when unset, in which case
# HuggingFaceHub authentication fails at request time, not here.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN") # Set this in environment
MODEL_NAME = "meta-llama/Llama-3-8b-chat-hf" # Also supports Mistral, Qwen, etc.

# Setup LLM via HuggingFace Hub
# NOTE(review): constructed at import time — a bad or missing token only
# surfaces when the first agent call runs; confirm that is acceptable.
llm = HuggingFaceHub(
    repo_id=MODEL_NAME,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    # Low temperature + short outputs: labels/keywords, not free-form text.
    model_kwargs={"temperature": 0.4, "max_new_tokens": 64}
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
# Shared sentence-embedding model used by cluster_texts().
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): keyword_extractor is not referenced anywhere in the visible
# code — keywords come from the CrewAI keyword agent instead. Confirm this
# is still needed or remove it.
keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")
25
 
 
26
# Mutable per-process state shared by the Gradio callbacks.
session = dict(
    original_df=None,         # DataFrame exactly as uploaded
    current_df=None,          # DataFrame with cluster/label/keywords columns
    context="",               # user-supplied labeling instruction
    topic_labels={},          # cluster id -> topic label
    keywords={},              # cluster id -> comma-separated keywords
    clusters_verified=False,  # set by the validation pass
)
34
 
35
# === AGENTS ===
def _make_agent(role, goal, backstory):
    """Build a non-verbose CrewAI agent bound to the shared LLM."""
    return Agent(role=role, goal=goal, backstory=backstory, llm=llm, verbose=False)

# Extracts representative keywords from each cluster's sample texts.
keyword_agent = _make_agent(
    role='Keyword Analyst',
    goal='Extract top 5 keywords from a group of similar texts',
    backstory="""You are a skilled keyword analyst who identifies patterns in text data.
You focus on extracting concise, meaningful keywords that represent the core themes.""",
)

# Produces the short topic label for each cluster.
labeling_agent = _make_agent(
    role='Topic Labeler',
    goal='Generate a short label for a group of similar texts based on context',
    backstory="""You are a professional theme summarizer. Given example texts and a user context,
you generate clear and actionable topic labels.""",
)

# Judges whether a label + keywords pair is coherent.
validation_agent = _make_agent(
    role='QA Analyst',
    goal='Evaluate whether the clustered topics and keywords form coherent themes',
    backstory="""You are a quality assurance expert evaluating if generated topics make sense.
You return 'Approved' or 'Needs Refinement' based on coherence.""",
)

# Formats the approved dataset for export.
finalizer_agent = _make_agent(
    role='Data Engineer',
    goal='Prepare final labeled dataset for download',
    backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
)
70
+
71
# === TASKS ===
def create_tasks(text_samples, context_input):
    """Create the four CrewAI tasks used per cluster.

    Args:
        text_samples: newline-joined sample texts from one cluster.
        context_input: user instruction guiding the labeling.

    Returns:
        (keyword extraction, labeling, validation, finalization) tasks, in
        that order; callers may discard the ones they do not run.
    """
    keyword_task = Task(
        description=f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}",
        agent=keyword_agent,
        expected_output="Comma-separated list of keywords",
    )

    label_task = Task(
        description=f"Based on the following examples and instruction: '{context_input}', generate a concise topic label.\n\n{text_samples}",
        agent=labeling_agent,
        expected_output="A single line topic label",
    )

    # NOTE(review): the {label}/{keywords} placeholders are never substituted
    # in the visible code (the caller rebuilds its own validation task) —
    # confirm whether this task is meant to be formatted before use.
    validation_task = Task(
        description="Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {label}\nKEYWORDS: {keywords}",
        agent=validation_agent,
        expected_output="'Approved' or 'Needs Refinement'",
    )

    finalize_task = Task(
        description="Take the approved labeled DataFrame and format it for download.",
        agent=finalizer_agent,
        expected_output="Final CSV content as string",
    )

    return keyword_task, label_task, validation_task, finalize_task
98
+
99
# === CLUSTERING ===
def cluster_texts(texts, n_clusters=10):
    """KMeans-cluster *texts* on their sentence embeddings.

    Args:
        texts: list of strings to cluster.
        n_clusters: desired cluster count; clamped to len(texts) because
            KMeans raises ValueError when n_clusters > n_samples.

    Returns:
        Array of integer cluster ids, one per input text.
    """
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    # Defensive clamp: small uploads (< n_clusters rows) must not crash.
    k = max(1, min(n_clusters, len(texts)))
    kmeans = KMeans(n_clusters=k, random_state=42)
    return kmeans.fit_predict(embeddings)
104
 
105
# === FULL PIPELINE FUNCTION ===
def run_initial_analysis(csv_file, context_input, n_clusters=10):
    """End-to-end pipeline: read CSV, cluster, label/keyword via CrewAI, validate.

    Args:
        csv_file: uploaded file object (exposes .name) or a plain path string.
        context_input: user instruction guiding the topic labeling.
        n_clusters: desired number of KMeans clusters.

    Returns:
        (status message, full CSV content as string, markdown preview of the
        first 10 rows). On error the last two elements are empty strings.
    """
    try:
        # Gradio may pass a tempfile wrapper (with .name) or a plain path
        # string depending on the component's type= setting; accept both.
        path = getattr(csv_file, "name", csv_file)
        df = pd.read_csv(path)
    except Exception as e:
        return f"Error reading CSV: {str(e)}", "", ""

    # Validate the schema BEFORE mutating shared session state, so a bad
    # upload does not clobber the previous run's data.
    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", "", ""

    session['original_df'] = df.copy()
    session['context'] = context_input

    texts = df['text'].tolist()
    clusters = cluster_texts(texts, n_clusters)
    df['cluster'] = clusters

    topic_labels = {}
    keywords_map = {}

    for cid in range(n_clusters):
        members = [t for t, c in zip(texts, clusters) if c == cid]
        if not members:
            # KMeans can leave some requested cluster ids unpopulated.
            continue

        samples = "\n".join(members[:3])

        # Build CrewAI tasks for this cluster (finalize task unused here).
        ext_task, lbl_task, _val_task, _ = create_tasks(samples, context_input)

        # Keyword extraction crew.
        keyword_result = Crew(agents=[keyword_agent], tasks=[ext_task]).kickoff()
        keywords_map[cid] = keyword_result.raw.strip()

        # Topic labeling crew.
        label_result = Crew(agents=[labeling_agent], tasks=[lbl_task]).kickoff()
        topic_labels[cid] = label_result.raw.strip()

    # Assign labels and keywords back to the DataFrame (empty clusters -> NaN).
    df['label'] = df['cluster'].map(topic_labels)
    df['keywords'] = df['cluster'].map(keywords_map)

    session['current_df'] = df
    # Populate the session keys declared for these (previously never written).
    session['topic_labels'] = topic_labels
    session['keywords'] = keywords_map

    # Validation pass: a single "Needs Refinement" verdict flags the run.
    for cid in topic_labels:
        val_task = Task(
            description=(
                "Evaluate whether the topic label and keywords make sense together."
                f"\n\nLABEL: {topic_labels[cid]}\nKEYWORDS: {keywords_map.get(cid, '')}"
            ),
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'",
        )
        res = Crew(agents=[validation_agent], tasks=[val_task]).kickoff()
        if "Needs" in res.raw:
            session["clusters_verified"] = False
            break
    else:
        # for/else: executes only when no cluster was flagged above.
        session["clusters_verified"] = True

    output = io.StringIO()
    df.to_csv(output, index=False)
    csv_str = output.getvalue()

    return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False)
172
 
173
+ # === REFINEMENT FUNCTION ===
174
  def refine_labels(feedback_input):
175
  if session['current_df'] is None:
176
  return "No data found. Please run initial analysis first.", "", ""
 
194
  Return only the revised labels, one per line.
195
  """
196
 
197
+ # Simulating refinement using the same LLM
198
+ response = llm(prompt)
199
  new_labels = response.strip().split('\n')[:len(df)]
200
 
201
  df['label'] = new_labels[:len(df)]
 
207
 
208
  return "Labels refined!", csv_str, df.head(10).to_markdown(index=False)
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # === GRADIO UI ===
211
+ with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
212
+ gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
213
+ gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")
214
 
215
  with gr.Row():
216
  with gr.Column():
 
231
  refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
232
 
233
  if __name__ == "__main__":
234
+ demo.launch()