Alimubariz124 committed on
Commit
c2fded3
·
verified ·
1 Parent(s): 9314665

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.cluster import KMeans
5
+ from sentence_transformers import SentenceTransformer
6
+ import requests
7
+ import os
8
+
9
# === CONFIGURATION ===
# The Hugging Face API token is read from the environment, never hard-coded.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
FALCON_MODEL = "tiiuae/falcon-7b-instruct"

# === STEP 1: CLUSTERING MODEL ===
# Instantiated once at import time and shared by every embedding request.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
17
+
18
def get_embeddings(texts):
    """Encode ``texts`` with the module-level ``embedding_model``.

    The progress bar is disabled. The return value is whatever
    ``embedding_model.encode`` produces for the given texts.
    """
    vectors = embedding_model.encode(texts, show_progress_bar=False)
    return vectors
20
+
21
def cluster_texts(texts, n_clusters=10):
    """Cluster ``texts`` with KMeans over their sentence embeddings.

    Args:
        texts: Sequence of strings to cluster.
        n_clusters: Requested number of clusters. It is coerced to ``int``
            (UI sliders may deliver floats) and capped at ``len(texts)``,
            because KMeans cannot fit more clusters than it has samples.

    Returns:
        Array of cluster ids, one per text, in input order. An empty input
        yields an empty integer array.
    """
    if len(texts) == 0:
        # Nothing to embed or fit; KMeans would raise on an empty matrix.
        return np.array([], dtype=int)

    embeddings = get_embeddings(texts)
    k = max(1, min(int(n_clusters), len(texts)))
    kmeans = KMeans(n_clusters=k, random_state=42)
    return kmeans.fit_predict(embeddings)
26
+
27
+ # === STEP 2: FALCON-BASED LABELING ===
28
def query_falcon(prompt):
    """Send ``prompt`` to the hosted Falcon model and return its completion.

    Args:
        prompt: Full prompt text to submit to the inference endpoint.

    Returns:
        The generated text with surrounding whitespace stripped, or an empty
        string if the request fails or the response has an unexpected shape.
        Errors are printed rather than raised so that callers can carry on
        with an empty label.
    """
    api_url = f"https://api-inference.huggingface.co/models/{FALCON_MODEL}"
    # Only send credentials when a token is configured; "Bearer None" is
    # never a valid header.
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0.3,
            "do_sample": True,
            # Without this the API echoes the prompt in front of the
            # completion, so the "label" would contain the whole prompt.
            "return_full_text": False,
        },
    }

    try:
        # The timeout keeps a stalled endpoint from hanging the UI callback.
        response = requests.post(api_url, headers=headers, json=payload, timeout=60)
        data = response.json()
        return data[0]['generated_text'].strip()
    except (requests.RequestException, ValueError, KeyError, IndexError,
            TypeError, AttributeError) as e:
        # Covers network failures, non-JSON bodies, and error payloads such
        # as {"error": "..."} that lack the expected list structure.
        print(f"Error calling Falcon: {e}")
        return ""
47
+
48
def generate_topic_labels(texts, clusters, n_clusters=10):
    """Ask Falcon for a short topic label for each cluster.

    Args:
        texts: Sequence of feedback strings.
        clusters: Sequence of cluster ids aligned with ``texts``.
        n_clusters: Number of cluster ids to label (``0 .. n_clusters - 1``).
            Coerced to ``int`` so a float from a UI slider is accepted.

    Returns:
        Dict mapping every cluster id in ``range(n_clusters)`` to the label
        returned by ``query_falcon``. Clusters with no member texts get an
        empty label without calling the model.
    """
    # One pass over the data: keep up to three example texts per cluster
    # instead of rescanning every text once per cluster.
    samples_by_cluster = {cid: [] for cid in range(int(n_clusters))}
    for text, cid in zip(texts, clusters):
        bucket = samples_by_cluster.get(int(cid))
        if bucket is not None and len(bucket) < 3:
            bucket.append(str(text))

    topic_labels = {}
    for cid, samples in samples_by_cluster.items():
        if not samples:
            # No examples to show the model; skip the API round-trip.
            topic_labels[cid] = ""
            continue

        sample_text = "\n".join(samples)
        prompt = (
            "\n"
            "You are an expert in qualitative analysis.\n"
            "Given the following customer feedback examples from one group, "
            "describe the overall theme in 1–2 words.\n"
            "\n"
            "EXAMPLES:\n"
            f"{sample_text}\n"
            "\n"
            "TOPIC LABEL:\n"
        )
        topic_labels[cid] = query_falcon(prompt)

    return topic_labels
70
+
71
+ # === STEP 3: REFINEMENT LOOP UTILS ===
72
# Module-level state shared between the two Gradio callbacks: the uploaded
# data, the working copy with cluster/label columns, the user's context text,
# and the most recent cluster-id -> label mapping.
session = dict(
    original_df=None,
    current_df=None,
    context="",
    topic_labels={},
)
78
+
79
def run_initial_analysis(csv_file, context_input, n_clusters=10):
    """Cluster the uploaded feedback and label each cluster.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``;
            the CSV must contain a ``text`` column.
        context_input: Free-form context string, stored in ``session``.
        n_clusters: Requested number of topics. Coerced to ``int`` and capped
            at the number of usable rows.

    Returns:
        A ``(status, csv_path, preview)`` tuple for the Gradio outputs:
        a status message, the path of a temporary CSV holding the labeled
        data (``None`` on failure), and a markdown table of the first ten
        rows (empty string on failure).
    """
    # Local import so this function does not depend on a module-level import.
    import tempfile

    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        # UI boundary: any read or parse failure is reported as a status
        # message rather than crashing the callback.
        return f"Error reading CSV: {str(e)}", None, ""

    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", None, ""

    original_df = df.copy()

    # Rows without text cannot be embedded; drop them and make sure every
    # remaining value is a string.
    df = df.dropna(subset=['text']).reset_index(drop=True)
    if df.empty:
        return "CSV contains no rows with text to analyze", None, ""
    df['text'] = df['text'].astype(str)

    # Sliders may deliver floats, and KMeans needs n_clusters <= n_samples.
    n_clusters = max(1, min(int(n_clusters), len(df)))

    session['original_df'] = original_df
    session['context'] = context_input

    texts = df['text'].tolist()
    clusters = cluster_texts(texts, n_clusters)
    df['cluster'] = clusters

    topic_labels = generate_topic_labels(texts, clusters, n_clusters)
    df['label'] = df['cluster'].map(topic_labels)

    session['current_df'] = df
    session['topic_labels'] = topic_labels

    # The download component expects a file path, not CSV text, so the
    # result is written to a temporary file that outlives this call.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".csv", prefix="labeled_", delete=False,
        newline="", encoding="utf-8",
    ) as handle:
        df.to_csv(handle, index=False)
        csv_path = handle.name

    return "Initial analysis complete!", csv_path, df.head(10).to_markdown(index=False)
107
+
108
def refine_labels(feedback_input):
    """Revise the labels of the previewed rows according to user feedback.

    The model is shown the first ten rows (text and current label) together
    with ``feedback_input`` and asked for one revised label per line. The
    returned labels are applied positionally to those previewed rows only;
    rows the model never saw keep their existing labels.

    Args:
        feedback_input: Free-form refinement instructions from the user.

    Returns:
        A ``(status, csv_path, preview)`` tuple for the Gradio outputs:
        a status message, the path of a temporary CSV with the current data
        (``None`` if no analysis has been run), and a markdown table of the
        first ten rows (empty string if no analysis has been run).
    """
    # Local import so this function does not depend on a module-level import.
    import tempfile

    df = session['current_df']
    if df is None:
        return "No data found. Please run initial analysis first.", None, ""

    preview_rows = df.head(10)
    current_sample = preview_rows[['text', 'label']].to_markdown(index=False)

    prompt = (
        "\n"
        "You are helping refine topic labels based on user feedback.\n"
        "\n"
        "Current Labels:\n"
        f"{current_sample}\n"
        "\n"
        "User Feedback:\n"
        f"{feedback_input}\n"
        "\n"
        "Task:\n"
        "Reassign labels accordingly. Keep output format consistent: one label per line.\n"
        "\n"
        "Instructions:\n"
        "Return only the revised labels, one per line.\n"
    )

    response = query_falcon(prompt)
    # Ignore blank lines and never take more labels than rows were shown.
    new_labels = [line.strip() for line in response.splitlines() if line.strip()]
    new_labels = new_labels[:len(preview_rows)]

    if new_labels:
        # Assigning a list shorter than the frame to the whole column raises
        # a length-mismatch error, so only the leading rows are updated.
        label_col = df.columns.get_loc('label')
        df.iloc[:len(new_labels), label_col] = new_labels
        status = "Labels refined!"
    else:
        status = "No labels returned by the model; labels left unchanged."

    session['current_df'] = df

    # The download component expects a file path, not CSV text.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".csv", prefix="labeled_", delete=False,
        newline="", encoding="utf-8",
    ) as handle:
        df.to_csv(handle, index=False)
        csv_path = handle.name

    return status, csv_path, df.head(10).to_markdown(index=False)
142
+
143
+ # === GRADIO UI ===
144
# Two-column layout: the left column runs the initial analysis, the right one
# refines labels. Both callbacks feed the same status/preview/download outputs.
with gr.Blocks(title="Falcon Topic Modeling") as demo:
    gr.Markdown("# 🎯 Falcon-Powered Topic Modeling")
    gr.Markdown("Upload verbatims, get topics, and refine iteratively.")

    with gr.Row():
        with gr.Column():
            upload = gr.File(label="Upload CSV ('text' column)")
            context = gr.Textbox(label="Context/Instruction", lines=5, value="Group these into common themes.")
            # step=1 keeps the topic count a whole number; a cluster count
            # must be an integer.
            cluster_slider = gr.Slider(2, 20, value=10, step=1, label="Number of Topics")
            run_btn = gr.Button("Run Initial Analysis")

        with gr.Column():
            feedback = gr.Textbox(label="Feedback / Instructions for Refinement", lines=5)
            refine_btn = gr.Button("Refine Labels")

    status = gr.Textbox(label="Status")
    preview = gr.Textbox(label="First 10 Rows (Editable View)", lines=10)
    download = gr.File(label="Download Final Labeled CSV")

    run_btn.click(fn=run_initial_analysis, inputs=[upload, context, cluster_slider], outputs=[status, download, preview])
    refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])

if __name__ == "__main__":
    demo.launch()