Anirudha Soni committed
Commit · 597fb81
Parent(s): fbc59bf
Basic changes

Files changed:
- app.py +36 -22
- preprocess.py +19 -36
- retriever.py +13 -50
app.py
CHANGED

@@ -11,24 +11,28 @@ app.layout = dbc.Container(
         html.H1("Toolkit Document Search", className="mb-4"),
         dbc.Row([
             dbc.Col([
-                dcc.Input(
-
-
+                dcc.Input(
+                    id="query-input",
+                    type="text",
+                    placeholder="Type your question...",
+                    debounce=True,
+                    style={"width": "100%", "padding": "10px"},
+                ),
                 html.Br(), html.Br(),
-                dbc.Button("Search", id="search-btn", color="primary")
-            ], md=8)
+                dbc.Button("Search", id="search-btn", color="primary"),
+            ], md=8),
         ]),
         html.Hr(),
-        html.Div(id="results-area")
+        html.Div(id="results-area"),
     ],
-    fluid=True
+    fluid=True,
 )

 @app.callback(
     Output("results-area", "children"),
     Input("search-btn", "n_clicks"),
     State("query-input", "value"),
-    prevent_initial_call=True
+    prevent_initial_call=True,
 )
 def search_callback(_, query):
     if not query:

@@ -42,7 +46,11 @@ def search_callback(_, query):
     for idx, r in enumerate(results):
         meta = r["metadata"]
         title = meta.get("name") or f"Document {meta.get('document_id')}"
-        subtitle =
+        subtitle = (
+            f"Created: {meta.get('create_date') or 'N/A'} | "
+            f"Published: {meta.get('publish_date') or 'N/A'} | "
+            f"Categories: {', '.join(meta.get('categories') or [])}"
+        )
         cards.append(
             dbc.Card(
                 [

@@ -55,26 +63,32 @@ def search_callback(_, query):
                             id={"type": "collapse-btn", "index": idx},
                             color="link",
                             n_clicks=0,
-                            style={"float": "right"}
+                            style={"float": "right"},
+                        ),
+                        html.Div(
+                            f"Score: {r['score']:.3f}",
+                            style={"float": "right", "marginRight": "1em"},
                         ),
-                        html.Div(f"Score: {r['score']:.3f}", style={"float": "right", "marginRight": "1em"})
                     ],
-                    style={"display": "flex", "flexDirection": "column"}
-                ),
-                dbc.CardBody(
-                    html.P(r["snippet"], style={"fontStyle": "italic"})
-                ),
+                    style={"display": "flex", "flexDirection": "column"},
+                ),
+                dbc.CardBody(html.P(r["snippet"], style={"fontStyle": "italic"})),
                 dbc.Collapse(
                     dbc.CardBody(
-                        html.Pre(
-
-
+                        html.Pre(
+                            r["full_text"],
+                            style={
+                                "whiteSpace": "pre-wrap",
+                                "maxHeight": "300px",
+                                "overflowY": "auto",
+                            },
+                        )
                     ),
                     id={"type": "collapse", "index": idx},
-                    is_open=False
+                    is_open=False,
                 ),
             ],
-            className="mb-3"
+            className="mb-3",
         )
     )
     return cards

@@ -82,7 +96,7 @@ def search_callback(_, query):
 @app.callback(
     Output({"type": "collapse", "index": MATCH}, "is_open"),
     Input({"type": "collapse-btn", "index": MATCH}, "n_clicks"),
-    State({"type": "collapse", "index": MATCH}, "is_open")
+    State({"type": "collapse", "index": MATCH}, "is_open"),
 )
 def toggle_collapse(n, is_open):
     if n:

@@ -92,4 +106,4 @@ def toggle_collapse(n, is_open):
 if __name__ == "__main__":
     import os
     port = int(os.environ.get("PORT", 7860))
-    app.run_server(host="0.0.0.0", port=port, debug=False)
+    app.run_server(host="0.0.0.0", port=port, debug=False)
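Note: the diff starts at line 11 of app.py, so the file's imports and app construction are not shown. A minimal sketch of the header these hunks appear to assume (the dbc.* and dcc.* calls imply dash and dash-bootstrap-components; the original lines may differ):

import dash
import dash_bootstrap_components as dbc
from dash import dcc, html, Input, Output, State, MATCH

# Assumed setup, not part of this commit's diff.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])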
preprocess.py
CHANGED

@@ -1,60 +1,43 @@
-import json
-import
-from sentence_transformers import SentenceTransformer
+import json
+import re
 from pathlib import Path

 DATA_DIR = Path("data")

 def load_json(filename):
-
+    """Load a JSON file and return list of records."""
+    with open(DATA_DIR / filename, "r", encoding="utf-8") as f:
         data = json.load(f)
     if isinstance(data, dict) and "results" in data:
         return data["results"]
     return data if isinstance(data, list) else []

 def extract_text(item):
+    """Extract textual fields from a JSON record."""
     texts = []
-
-
-
-
-
-
-
-        texts.append(v)
+    for k in ("text", "description", "body", "content", "name"):
+        if k in item and item[k]:
+            texts.append(str(item[k]))
+    if "content_json" in item and isinstance(item["content_json"], dict):
+        for v in item["content_json"].values():
+            if isinstance(v, str) and v.strip():
+                texts.append(v)
     return texts

 def chunk_text(text, max_words=80):
+    """Split long text into smaller chunks."""
     sentences = re.split(r'(?<=[.!?]) +', text)
     chunks, cur, count = [], [], 0
     for s in sentences:
         words = s.split()
-        if len(words) < 5:
+        if len(words) < 5:
+            continue
         if count + len(words) > max_words and cur:
             chunks.append(" ".join(cur))
             cur, count = [s], len(words)
         else:
-            cur.append(s)
-
+            cur.append(s)
+            count += len(words)
+    if cur:
+        chunks.append(" ".join(cur))
     return chunks
-
-print("🔄 Loading JSON...")
-content = load_json("Toolkit_Content_results.json")
-resources = load_json("Toolkit_Resources_results.json")
-
-docs = []
-for item in content + resources:
-    for t in extract_text(item):
-        docs.extend(chunk_text(t))
-print(f"✅ Loaded {len(docs)} chunks")
-
-print("🔄 Encoding with SentenceTransformer...")
-model = SentenceTransformer("all-MiniLM-L6-v2")
-embeddings = model.encode(docs, convert_to_numpy=True, normalize_embeddings=True)
-
-# Save
-print("💾 Saving artifacts...")
-np.save(DATA_DIR/"embeddings.npy", embeddings)
-with open(DATA_DIR/"docs.pkl", "wb") as f:
-    pickle.dump(docs, f)
-print("✅ Done!")
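Note: a quick way to see what the reworked chunk_text does. Sentences are split on terminal punctuation, sentences under five words are dropped, the running word count now advances on the else branch, and any partial chunk is flushed at the end. The sample text below is made up for illustration, not taken from the toolkit JSON:

from preprocess import chunk_text

# Made-up input; four sentences of 10, 14, 2, and 11 words.
text = (
    "Retrieval works better when documents are split into small chunks. "
    "Each chunk should stay within the word budget so one embedding covers one idea. "
    "Too short. "
    "Sentences under five words, like the one above, are dropped entirely."
)
for i, chunk in enumerate(chunk_text(text, max_words=20)):
    print(i, len(chunk.split()), chunk)
# Prints three chunks of 10, 14, and 11 words; "Too short." never appears.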
retriever.py
CHANGED

@@ -4,6 +4,9 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
 from collections import defaultdict

+from preprocess import load_json, extract_text, chunk_text
+
+# --------- Paths ---------
 DATA_DIR = Path("data")
 DATA_DIR.mkdir(exist_ok=True)
 EMB_FILE = DATA_DIR / "embeddings.npy"

@@ -13,75 +16,35 @@ META_FILE = DATA_DIR / "metadata.pkl"  # metadata for each doc
 CONTENT_FILE = DATA_DIR / "Toolkit_Content_results.json"
 RESOURCES_FILE = DATA_DIR / "Toolkit_Resources_results.json"

+# Embedding model
 model = SentenceTransformer("all-MiniLM-L6-v2")

-# ---------
-def _load_json(filename):
-    with open(filename, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    if isinstance(data, dict) and "results" in data:
-        return data["results"]
-    return data if isinstance(data, list) else []
-
-def _extract_text(item):
-    texts = []
-    for k in ("text", "description", "body", "content", "name"):
-        if k in item and item[k]:
-            texts.append(str(item[k]))
-    if "content_json" in item and isinstance(item["content_json"], dict):
-        for v in item["content_json"].values():
-            if isinstance(v, str) and v.strip():
-                texts.append(v)
-    return texts
-
-def _chunk_text(text, max_words=80):
-    sentences = re.split(r'(?<=[.!?]) +', text)
-    chunks, cur, count = [], [], 0
-    for s in sentences:
-        words = s.split()
-        if len(words) < 5:
-            continue
-        if count + len(words) > max_words and cur:
-            chunks.append(" ".join(cur))
-            cur, count = [s], len(words)
-        else:
-            cur.append(s)
-            count += len(words)
-    if cur:
-        chunks.append(" ".join(cur))
-    return chunks
-
+# --------- Build Index ---------
 def _build_index():
     print("🔄 Building index...")
-    content =
-    resources =
+    content = load_json("Toolkit_Content_results.json")
+    resources = load_json("Toolkit_Resources_results.json")

-    chunks = []
-    chunk_to_doc_idx = []
-    documents = []
-    metadata = []  # will store dict with name, id, dates
+    chunks, chunk_to_doc_idx, documents, metadata = [], [], [], []

     for item in content + resources:
-
-        full_text = "\n".join(_extract_text(item))
+        full_text = "\n".join(extract_text(item))
         if not full_text.strip():
             continue

         doc_idx = len(documents)
         documents.append(full_text)

-        # --- Metadata ---
         meta = {
             "document_id": item.get("document_id"),
             "name": item.get("name"),
             "create_date": item.get("create_date"),
             "publish_date": item.get("publish_date"),
-            "categories": item.get("categories")
+            "categories": item.get("categories"),
         }
         metadata.append(meta)

-
-        for ch in _chunk_text(full_text):
+        for ch in chunk_text(full_text):
             chunks.append(ch)
             chunk_to_doc_idx.append(doc_idx)

@@ -110,14 +73,14 @@ def _load_or_build():
         metadata = pickle.load(f)
     return chunks, chunk_to_doc_idx, documents, metadata, embeddings

+# Load on import
 chunks, chunk_to_doc_idx, documents, metadata, embeddings = _load_or_build()

-# ---------
+# --------- Retrieval ---------
 def retrieve(query, top_k=5):
     q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
     scores = (embeddings @ q_emb.T).squeeze()

-    # Aggregate: pick the max scoring chunk per document
     doc_best = defaultdict(lambda: (-np.inf, None))  # (score, best_snippet)
     for idx, sc in enumerate(scores):
         doc_id = chunk_to_doc_idx[idx]