Spaces:

cjc0013
/

epstein-semantic-explorer

Sleeping

App Files Files Community

cjc0013 committed on Nov 15, 2025

Commit

f2ffec2

verified ·

1 Parent(s): 2bf9d37

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -51

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import gradio as gr
 import json, re, math, os
 from collections import Counter, defaultdict
 # ===============================================================
 # UTILITIES
 # ===============================================================
@@ -33,23 +32,44 @@ def cosine(a, b):
         return 0
     return num / math.sqrt(da*db)
 # ===============================================================
-# MAIN LOAD FUNCTION
 # ===============================================================
-def load_jsonl(jsonl_file):
-    if jsonl_file is None:
-        return None, None, "⚠ No file uploaded."
     records = []
-    with open(jsonl_file.name, "r", encoding="utf8") as f:
         for line in f:
             try:
                 records.append(json.loads(line))
             except:
                 pass
     cluster_map = defaultdict(list)
     for r in records:
         cluster_map[r.get("cluster", -1)].append(r)
@@ -63,7 +83,7 @@ def load_jsonl(jsonl_file):
             doc_freq[t] += 1
     Ndocs = len(records)
-    avg_len = sum(len(t) for t in tokenized_docs) / Ndocs
     centroids = {cid: centroid(docs) for cid, docs in cluster_map.items()}
@@ -86,24 +106,27 @@ def bm25_score(query, doc_toks, doc_freq, Ndocs, avg_len):
     k=1.5; b=0.75
     score = 0
     q_toks = tokenize(query)
     for q in q_toks:
-        df = doc_freq.get(q,0)
         if df == 0:
             continue
-        idf = math.log((Ndocs - df + 0.5)/(df + 0.5) + 1)
         tf = doc_toks.count(q)
-        denom = tf + k*(1 - b + b*(len(doc_toks)/avg_len))
-        score += idf*(tf*(k+1))/denom
     return score
 # ===============================================================
-# GRADIO INTERFACE FUNCTIONS
 # ===============================================================
 def do_view_cluster(state, cid):
     if state is None:
-        return "⚠ Upload a file first."
     try:
         cid = int(cid)
     except:
@@ -114,42 +137,36 @@ def do_view_cluster(state, cid):
     if cid not in cluster_map:
         return "❌ Cluster not found."
     out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
-    for d in cluster_map[cid][:20]:
-        t = d["text"].strip()
-        if len(t) > 1200:
-            t = t[:1200] + " … [truncated]"
-        out.append(f"\n--- id={d.get('id')} ---\n{t}\n")
     return "\n".join(out)
 def do_search(state, query):
     if state is None:
-        return "⚠ Upload a file first."
-    records = state["records"]
-    tokenized_docs = state["tokenized_docs"]
-    doc_freq = state["doc_freq"]
-    Ndocs = state["Ndocs"]
-    avg_len = state["avg_len"]
-    scores = []
-    for r, toks in zip(records, tokenized_docs):
-        s = bm25_score(query, toks, doc_freq, Ndocs, avg_len)
-        if s > 0:
-            scores.append((s, r))
-    scores.sort(reverse=True, key=lambda x: x[0])
     out = [f"=== Results for '{query}' ==="]
-    for s, r in scores[:15]:
-        out.append(f"\nScore {s:.2f} — Cluster {r['cluster']} — id={r['id']}\n{r['text'][:500]}…\n")
     return "\n".join(out)
 def do_show_topics(state):
     if state is None:
-        return "⚠ Upload a file first."
     STOPWORDS = set("""
 the and to of a in is this that for on with as be or by from at
@@ -158,10 +175,11 @@ subject re fw message thereof all may any doc email
 """.split())
     out = ["=== Cluster Topics ==="]
     for cid, cent in state["centroids"].items():
-        filtered = {w:c for w,c in cent.items()
                     if w not in STOPWORDS and len(w) > 2 and c > 1}
-        top = [w for w,_ in Counter(filtered).most_common(6)]
         out.append(f"Cluster {cid:<4} | {' '.join(top)}")
     return "\n".join(out)
@@ -169,12 +187,10 @@ subject re fw message thereof all may any doc email
 def do_entity_search(state, name):
     if state is None:
-        return "⚠ Upload a file first."
     hits = []
-    cluster_map = state["cluster_map"]
-    for cid, docs in cluster_map.items():
         count = sum(name.lower() in d["text"].lower() for d in docs)
         if count:
             hits.append((count, cid))
@@ -182,31 +198,45 @@ def do_entity_search(state, name):
     hits.sort(reverse=True)
     out = [f"=== Clusters mentioning '{name}' ==="]
-    for count, cid in hits[:20]:
         out.append(f"Cluster {cid}: {count} hits")
     return "\n".join(out)
 # ===============================================================
 # GRADIO UI
 # ===============================================================
 with gr.Blocks(title="Epstein Semantic Explorer") as demo:
-    gr.Markdown("""
-# Epstein Semantic Explorer
-Upload your `epstein_semantic.jsonl` file to begin.
-""")
     with gr.Row():
-        jsonl_file = gr.File(label="Upload JSONL dataset")
         load_btn = gr.Button("Load Dataset")
-    state = gr.State()
-    clusters_box = gr.Number(label="Cluster # to View", value=96)
-    query_box = gr.Textbox(label="Search Keyword")
-    entity_box = gr.Textbox(label="Search for Entity (name)")
-    output = gr.Textbox(label="Output", lines=30)
     load_btn.click(load_jsonl, inputs=[jsonl_file], outputs=[state, clusters_box, output])
     clusters_box.change(do_view_cluster, inputs=[state, clusters_box], outputs=output)

 import json, re, math, os
 from collections import Counter, defaultdict
 # ===============================================================
 # UTILITIES
 # ===============================================================
         return 0
     return num / math.sqrt(da*db)
 # ===============================================================
+# LOAD JSONL FROM FILE
 # ===============================================================
def _read_jsonl_records(path):
    """Read *path* line by line and return the JSON objects that parse.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped, so a single bad line does not abort the whole load.
    """
    records = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: ignore malformed lines, but only JSON
                # parse errors -- a bare ``except`` would also hide
                # KeyboardInterrupt and genuine bugs.
                continue
            # initialize_state() calls .get() on every record, so a JSON
            # value that is not an object would crash it later.
            if isinstance(record, dict):
                records.append(record)
    return records


def load_records_from_path(path):
    """Load a dataset from an existing file, used at startup.

    Returns ``(None, None, message)`` when *path* does not exist; otherwise
    returns whatever ``initialize_state`` produces for the parsed records.
    """
    if not os.path.exists(path):
        return None, None, "⚠ JSONL file not found."
    return initialize_state(_read_jsonl_records(path))


def load_jsonl(user_file):
    """Load a dataset from a user upload.

    *user_file* is the value delivered by the upload component. It is
    accepted either as an object exposing the file path as ``.name`` or as a
    plain path string.
    NOTE(review): which of the two the component passes depends on the
    installed Gradio version -- confirm.
    """
    if user_file is None:
        return gr.update(), None, "⚠ No file uploaded."
    path = getattr(user_file, "name", user_file)
    return initialize_state(_read_jsonl_records(path))
+def initialize_state(records):
+    """Builds all indexes for search, clustering, etc."""
     cluster_map = defaultdict(list)
     for r in records:
         cluster_map[r.get("cluster", -1)].append(r)
             doc_freq[t] += 1
     Ndocs = len(records)
+    avg_len = sum(len(t) for t in tokenized_docs) / max(Ndocs, 1)
     centroids = {cid: centroid(docs) for cid, docs in cluster_map.items()}
     k=1.5; b=0.75
     score = 0
     q_toks = tokenize(query)
     for q in q_toks:
+        df = doc_freq.get(q, 0)
         if df == 0:
             continue
+        idf = math.log((Ndocs - df + 0.5) / (df + 0.5) + 1)
         tf = doc_toks.count(q)
+        denom = tf + k * (1 - b + b * (len(doc_toks) / avg_len))
+        score += idf * (tf * (k + 1)) / denom
     return score
 # ===============================================================
+# GRADIO FEATURE FUNCTIONS
 # ===============================================================
 def do_view_cluster(state, cid):
     if state is None:
+        return "⚠ No dataset loaded."
     try:
         cid = int(cid)
     except:
     if cid not in cluster_map:
         return "❌ Cluster not found."
+    # FULL TEXT (NO MORE TRUNCATION)
     out = [f"=== Cluster {cid} ({len(cluster_map[cid])} docs) ===\n"]
+    for d in cluster_map[cid]:
+        out.append(f"\n--- id={d.get('id')} ---\n{d['text']}\n")
     return "\n".join(out)
 def do_search(state, query):
     if state is None:
+        return "⚠ No dataset loaded."
+    results = []
+    for r, toks in zip(state["records"], state["tokenized_docs"]):
+        score = bm25_score(query, toks, state["doc_freq"], state["Ndocs"], state["avg_len"])
+        if score > 0:
+            results.append((score, r))
+    results.sort(reverse=True)
     out = [f"=== Results for '{query}' ==="]
+    for score, r in results[:30]:
+        out.append(f"\nScore {score:.2f} — Cluster {r['cluster']} — id={r['id']}\n{r['text']}\n")
     return "\n".join(out)
 def do_show_topics(state):
     if state is None:
+        return "⚠ No dataset loaded."
     STOPWORDS = set("""
 the and to of a in is this that for on with as be or by from at
 """.split())
     out = ["=== Cluster Topics ==="]
     for cid, cent in state["centroids"].items():
+        filtered = {w: c for w, c in cent.items()
                     if w not in STOPWORDS and len(w) > 2 and c > 1}
+        top = [w for w, _ in Counter(filtered).most_common(10)]
         out.append(f"Cluster {cid:<4} | {' '.join(top)}")
     return "\n".join(out)
 def do_entity_search(state, name):
     if state is None:
+        return "⚠ No dataset loaded."
     hits = []
+    for cid, docs in state["cluster_map"].items():
         count = sum(name.lower() in d["text"].lower() for d in docs)
         if count:
             hits.append((count, cid))
     hits.sort(reverse=True)
     out = [f"=== Clusters mentioning '{name}' ==="]
+    for count, cid in hits[:30]:
         out.append(f"Cluster {cid}: {count} hits")
     return "\n".join(out)
+# ===============================================================
+# AUTO-LOAD DATASET IF PRESENT
+# ===============================================================
# Dataset file looked for next to the app at import time.
DEFAULT_PATH = "epstein_semantic.jsonl"
if os.path.exists(DEFAULT_PATH):
    # Build the initial state, cluster value and status message from the file.
    startup_state, startup_clusters, startup_msg = load_records_from_path(DEFAULT_PATH)
else:
    # Nothing to preload: the UI starts empty with a warning message.
    startup_state, startup_clusters = None, None
    startup_msg = "⚠ No default dataset found."
 # ===============================================================
 # GRADIO UI
 # ===============================================================
 with gr.Blocks(title="Epstein Semantic Explorer") as demo:
+    gr.Markdown("# Epstein Semantic Explorer")
+    gr.Markdown(startup_msg)
     with gr.Row():
+        jsonl_file = gr.File(label="Upload different JSONL dataset")
         load_btn = gr.Button("Load Dataset")
+    state = gr.State(startup_state)
+    clusters_box = gr.Number(label="Cluster #", value=96)
+    query_box = gr.Textbox(label="Keyword Search")
+    entity_box = gr.Textbox(label="Entity Search (name)")
+    output = gr.Textbox(label="Output", lines=40)
     load_btn.click(load_jsonl, inputs=[jsonl_file], outputs=[state, clusters_box, output])
     clusters_box.change(do_view_cluster, inputs=[state, clusters_box], outputs=output)