cjc0013 committed on
Commit
f2ffec2
·
verified ·
1 Parent(s): 2bf9d37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -51
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import json, re, math, os
3
  from collections import Counter, defaultdict
4
 
5
-
6
  # ===============================================================
7
  # UTILITIES
8
  # ===============================================================
@@ -33,23 +32,44 @@ def cosine(a, b):
33
  return 0
34
  return num / math.sqrt(da*db)
35
 
36
-
37
  # ===============================================================
38
- # MAIN LOAD FUNCTION
39
  # ===============================================================
40
 
41
- def load_jsonl(jsonl_file):
42
- if jsonl_file is None:
43
- return None, None, "⚠ No file uploaded."
 
44
 
45
  records = []
46
- with open(jsonl_file.name, "r", encoding="utf8") as f:
47
  for line in f:
48
  try:
49
  records.append(json.loads(line))
50
  except:
51
  pass
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  cluster_map = defaultdict(list)
54
  for r in records:
55
  cluster_map[r.get("cluster", -1)].append(r)
@@ -63,7 +83,7 @@ def load_jsonl(jsonl_file):
63
  doc_freq[t] += 1
64
 
65
  Ndocs = len(records)
66
- avg_len = sum(len(t) for t in tokenized_docs) / Ndocs
67
 
68
  centroids = {cid: centroid(docs) for cid, docs in cluster_map.items()}
69
 
@@ -86,24 +106,27 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
86
  k=1.5; b=0.75
87
  score = 0
88
  q_toks = tokenize(query)
 
89
  for q in q_toks:
90
- df = doc_freq.get(q,0)
91
  if df == 0:
92
  continue
93
- idf = math.log((Ndocs - df + 0.5)/(df + 0.5) + 1)
94
  tf = doc_toks.count(q)
95
- denom = tf + k*(1 - b + b*(len(doc_toks)/avg_len))
96
- score += idf*(tf*(k+1))/denom
 
97
  return score
98
 
99
 
100
  # ===============================================================
101
- # GRADIO INTERFACE FUNCTIONS
102
  # ===============================================================
103
 
104
  def do_view_cluster(state, cid):
105
  if state is None:
106
- return "⚠ Upload a file first."
 
107
  try:
108
  cid = int(cid)
109
  except:
@@ -114,42 +137,36 @@ def do_view_cluster(state, cid):
114
  if cid not in cluster_map:
115
  return "❌ Cluster not found."
116
 
 
117
  out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
118
- for d in cluster_map[cid][:20]:
119
- t = d["text"].strip()
120
- if len(t) > 1200:
121
- t = t[:1200] + " … [truncated]"
122
- out.append(f"\n--- id={d.get('id')} ---\n{t}\n")
123
  return "\n".join(out)
124
 
125
 
126
  def do_search(state, query):
127
  if state is None:
128
- return "⚠ Upload a file first."
129
 
130
- records = state["records"]
131
- tokenized_docs = state["tokenized_docs"]
132
- doc_freq = state["doc_freq"]
133
- Ndocs = state["Ndocs"]
134
- avg_len = state["avg_len"]
135
 
136
- scores = []
137
- for r, toks in zip(records, tokenized_docs):
138
- s = bm25_score(query, toks, doc_freq, Ndocs, avg_len)
139
- if s > 0:
140
- scores.append((s, r))
141
- scores.sort(reverse=True, key=lambda x: x[0])
142
 
143
  out = [f"=== Results for '{query}' ==="]
144
- for s, r in scores[:15]:
145
- out.append(f"\nScore {s:.2f} — Cluster {r['cluster']} — id={r['id']}\n{r['text'][:500]}…\n")
146
 
147
  return "\n".join(out)
148
 
149
 
150
  def do_show_topics(state):
151
  if state is None:
152
- return "⚠ Upload a file first."
153
 
154
  STOPWORDS = set("""
155
  the and to of a in is this that for on with as be or by from at
@@ -158,10 +175,11 @@ subject re fw message thereof all may any doc email
158
  """.split())
159
 
160
  out = ["=== Cluster Topics ==="]
 
161
  for cid, cent in state["centroids"].items():
162
- filtered = {w:c for w,c in cent.items()
163
  if w not in STOPWORDS and len(w) > 2 and c > 1}
164
- top = [w for w,_ in Counter(filtered).most_common(6)]
165
  out.append(f"Cluster {cid:<4} | {' '.join(top)}")
166
 
167
  return "\n".join(out)
@@ -169,12 +187,10 @@ subject re fw message thereof all may any doc email
169
 
170
  def do_entity_search(state, name):
171
  if state is None:
172
- return "⚠ Upload a file first."
173
 
174
  hits = []
175
- cluster_map = state["cluster_map"]
176
-
177
- for cid, docs in cluster_map.items():
178
  count = sum(name.lower() in d["text"].lower() for d in docs)
179
  if count:
180
  hits.append((count, cid))
@@ -182,31 +198,45 @@ def do_entity_search(state, name):
182
  hits.sort(reverse=True)
183
 
184
  out = [f"=== Clusters mentioning '{name}' ==="]
185
- for count, cid in hits[:20]:
186
  out.append(f"Cluster {cid}: {count} hits")
 
187
  return "\n".join(out)
188
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # ===============================================================
191
  # GRADIO UI
192
  # ===============================================================
193
 
194
  with gr.Blocks(title="Epstein Semantic Explorer") as demo:
195
- gr.Markdown("""
196
- # Epstein Semantic Explorer
197
- Upload your `epstein_semantic.jsonl` file to begin.
198
- """)
199
 
200
  with gr.Row():
201
- jsonl_file = gr.File(label="Upload JSONL dataset")
202
  load_btn = gr.Button("Load Dataset")
203
 
204
- state = gr.State()
205
- clusters_box = gr.Number(label="Cluster # to View", value=96)
206
- query_box = gr.Textbox(label="Search Keyword")
207
- entity_box = gr.Textbox(label="Search for Entity (name)")
208
 
209
- output = gr.Textbox(label="Output", lines=30)
210
 
211
  load_btn.click(load_jsonl, inputs=[jsonl_file], outputs=[state, clusters_box, output])
212
  clusters_box.change(do_view_cluster, inputs=[state, clusters_box], outputs=output)
 
2
  import json, re, math, os
3
  from collections import Counter, defaultdict
4
 
 
5
  # ===============================================================
6
  # UTILITIES
7
  # ===============================================================
 
32
  return 0
33
  return num / math.sqrt(da*db)
34
 
 
35
  # ===============================================================
36
+ # LOAD JSONL FROM FILE
37
  # ===============================================================
38
 
39
def load_records_from_path(path):
    """Load a JSONL dataset from a file on disk and build the search state.

    Used at startup to pre-load a bundled dataset.

    Args:
        path: Filesystem path of a JSONL file (one JSON object per line).

    Returns:
        ``(None, None, "⚠ JSONL file not found.")`` when ``path`` is not an
        existing regular file; otherwise whatever
        ``initialize_state(records)`` returns.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise blow up inside open().
    if not os.path.isfile(path):
        return None, None, "⚠ JSONL file not found."

    records = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not records; skip them explicitly.
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: ignore malformed lines, but only
                # malformed lines -- not every possible exception.
                continue
            # Downstream indexing calls r.get(...), so only JSON objects
            # are usable as records.
            if isinstance(record, dict):
                records.append(record)

    return initialize_state(records)
53
+
54
+
55
def load_jsonl(user_file):
    """Load a JSONL dataset from a user upload and build the search state.

    Args:
        user_file: Value delivered by the upload component, or ``None`` when
            nothing was uploaded. Either an object exposing the temp-file
            path as ``.name`` or a plain path string is accepted.

    Returns:
        ``(gr.update(), None, "⚠ No file uploaded.")`` when ``user_file`` is
        ``None``; otherwise whatever ``initialize_state(records)`` returns.
    """
    if user_file is None:
        return gr.update(), None, "⚠ No file uploaded."

    # NOTE(review): depending on the Gradio version / File ``type`` setting
    # the callback may receive a wrapper with ``.name`` or a bare path
    # string -- accept both rather than assuming the wrapper.
    path = getattr(user_file, "name", user_file)

    records = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: skip malformed lines only.
                continue
            # Downstream indexing calls r.get(...), so only JSON objects
            # are usable as records.
            if isinstance(record, dict):
                records.append(record)

    return initialize_state(records)
69
+
70
+
71
+ def initialize_state(records):
72
+ """Builds all indexes for search, clustering, etc."""
73
  cluster_map = defaultdict(list)
74
  for r in records:
75
  cluster_map[r.get("cluster", -1)].append(r)
 
83
  doc_freq[t] += 1
84
 
85
  Ndocs = len(records)
86
+ avg_len = sum(len(t) for t in tokenized_docs) / max(Ndocs, 1)
87
 
88
  centroids = {cid: centroid(docs) for cid, docs in cluster_map.items()}
89
 
 
106
  k=1.5; b=0.75
107
  score = 0
108
  q_toks = tokenize(query)
109
+
110
  for q in q_toks:
111
+ df = doc_freq.get(q, 0)
112
  if df == 0:
113
  continue
114
+ idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
115
  tf = doc_toks.count(q)
116
+ denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
117
+ score += idf * (tf * (k + 1)) / denom
118
+
119
  return score
120
 
121
 
122
  # ===============================================================
123
+ # GRADIO FEATURE FUNCTIONS
124
  # ===============================================================
125
 
126
  def do_view_cluster(state, cid):
127
  if state is None:
128
+ return "⚠ No dataset loaded."
129
+
130
  try:
131
  cid = int(cid)
132
  except:
 
137
  if cid not in cluster_map:
138
  return "❌ Cluster not found."
139
 
140
+ # FULL TEXT (NO MORE TRUNCATION)
141
  out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
142
+ for d in cluster_map[cid]:
143
+ out.append(f"\n--- id={d.get('id')} ---\n{d['text']}\n")
144
+
 
 
145
  return "\n".join(out)
146
 
147
 
148
  def do_search(state, query):
149
  if state is None:
150
+ return "⚠ No dataset loaded."
151
 
152
+ results = []
153
+ for r, toks in zip(state["records"], state["tokenized_docs"]):
154
+ score = bm25_score(query, toks, state["doc_freq"], state["Ndocs"], state["avg_len"])
155
+ if score > 0:
156
+ results.append((score, r))
157
 
158
+ results.sort(reverse=True)
 
 
 
 
 
159
 
160
  out = [f"=== Results for '{query}' ==="]
161
+ for score, r in results[:30]:
162
+ out.append(f"\nScore {score:.2f} — Cluster {r['cluster']} — id={r['id']}\n{r['text']}\n")
163
 
164
  return "\n".join(out)
165
 
166
 
167
  def do_show_topics(state):
168
  if state is None:
169
+ return "⚠ No dataset loaded."
170
 
171
  STOPWORDS = set("""
172
  the and to of a in is this that for on with as be or by from at
 
175
  """.split())
176
 
177
  out = ["=== Cluster Topics ==="]
178
+
179
  for cid, cent in state["centroids"].items():
180
+ filtered = {w: c for w, c in cent.items()
181
  if w not in STOPWORDS and len(w) > 2 and c > 1}
182
+ top = [w for w, _ in Counter(filtered).most_common(10)]
183
  out.append(f"Cluster {cid:<4} | {' '.join(top)}")
184
 
185
  return "\n".join(out)
 
187
 
188
  def do_entity_search(state, name):
189
  if state is None:
190
+ return "⚠ No dataset loaded."
191
 
192
  hits = []
193
+ for cid, docs in state["cluster_map"].items():
 
 
194
  count = sum(name.lower() in d["text"].lower() for d in docs)
195
  if count:
196
  hits.append((count, cid))
 
198
  hits.sort(reverse=True)
199
 
200
  out = [f"=== Clusters mentioning '{name}' ==="]
201
+ for count, cid in hits[:30]:
202
  out.append(f"Cluster {cid}: {count} hits")
203
+
204
  return "\n".join(out)
205
 
206
 
207
+ # ===============================================================
208
+ # AUTO-LOAD DATASET IF PRESENT
209
+ # ===============================================================
210
+
211
+ DEFAULT_PATH = "epstein_semantic.jsonl"
212
+
213
+ startup_state = None
214
+ startup_clusters = None
215
+ startup_msg = "⚠ No default dataset found."
216
+
217
+ if os.path.exists(DEFAULT_PATH):
218
+ startup_state, startup_clusters, startup_msg = load_records_from_path(DEFAULT_PATH)
219
+
220
+
221
  # ===============================================================
222
  # GRADIO UI
223
  # ===============================================================
224
 
225
  with gr.Blocks(title="Epstein Semantic Explorer") as demo:
226
+
227
+ gr.Markdown("# Epstein Semantic Explorer")
228
+ gr.Markdown(startup_msg)
 
229
 
230
  with gr.Row():
231
+ jsonl_file = gr.File(label="Upload different JSONL dataset")
232
  load_btn = gr.Button("Load Dataset")
233
 
234
+ state = gr.State(startup_state)
235
+ clusters_box = gr.Number(label="Cluster #", value=96)
236
+ query_box = gr.Textbox(label="Keyword Search")
237
+ entity_box = gr.Textbox(label="Entity Search (name)")
238
 
239
+ output = gr.Textbox(label="Output", lines=40)
240
 
241
  load_btn.click(load_jsonl, inputs=[jsonl_file], outputs=[state, clusters_box, output])
242
  clusters_box.change(do_view_cluster, inputs=[state, clusters_box], outputs=output)