Spaces:

kamp0010
/

cc1

Runtime error

App Files Community

kamp0010 committed 6 days ago

Commit

5a2fe7a

verified ·

1 Parent(s): 5379d0c

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -101

app.py CHANGED Viewed

@@ -1,39 +1,49 @@
 import os
 import builtins
-# ── MUST happen before ANY other import ───────────────────────────────────────
-# transformers calls builtins.input() for the "Do you wish to run the custom
-# code? [y/N]" prompt.  Patch it to always answer "y" silently.
 _real_input = builtins.input
 def _auto_yes(prompt=""):
-    if "custom code" in str(prompt).lower() or "trust" in str(prompt).lower():
         return "y"
     return _real_input(prompt)
 builtins.input = _auto_yes
-os.environ["TRUST_REMOTE_CODE"]            = "1"
-os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
-os.environ["TOKENIZERS_PARALLELISM"]       = "false"
-os.environ["HF_HUB_VERBOSITY"]            = "error"
 import streamlit as st
 import numpy as np
 import re
 from transformers import AutoModel
-# Belt-and-suspenders: patch the internal resolver too, after import
 try:
     import transformers.dynamic_module_utils as _dmu
-    _dmu.resolve_trust_remote_code = lambda *a, **kw: True  # type: ignore
 except Exception:
     pass
-# ─────────────────────────── Page config ────────────────────────────
-st.set_page_config(
-    page_title="pplx-embed Semantic Search",
-    page_icon="🔍",
-    layout="wide",
-)
 st.title("🔍 Semantic Search with pplx-embed-context-v1")
 st.caption(
@@ -41,88 +51,57 @@ st.caption(
     "Powered by [perplexity-ai/pplx-embed-context-v1-0.6B](https://huggingface.co/perplexity-ai/pplx-embed-context-v1-0.6b)."
 )
-# ─────────────────────────── Model loading ──────────────────────────
 @st.cache_resource(show_spinner="Loading embedding models — this takes ~30 s on first run…")
 def load_models():
-    ctx_model = AutoModel.from_pretrained(
-        "perplexity-ai/pplx-embed-context-v1-0.6B",
-        trust_remote_code=True,
-    )
-    query_model = AutoModel.from_pretrained(
-        "perplexity-ai/pplx-embed-v1-0.6B",
-        trust_remote_code=True,
-    )
     return ctx_model, query_model
 ctx_model, query_model = load_models()
-# ─────────────────────────── Helpers ────────────────────────────────
-def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
-    """Split text into sentence-based chunks with overlap."""
-    # Split into sentences (rough split on . ! ? followed by whitespace)
     sentences = re.split(r'(?<=[.!?])\s+', text.strip())
     sentences = [s.strip() for s in sentences if s.strip()]
-    chunks = []
-    i = 0
     while i < len(sentences):
-        chunk = " ".join(sentences[i : i + chunk_size])
-        chunks.append(chunk)
         i += max(1, chunk_size - overlap)
     return chunks
-def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
-    """Cosine similarity between two 1-D vectors."""
-    norm_a = np.linalg.norm(a)
-    norm_b = np.linalg.norm(b)
-    if norm_a == 0 or norm_b == 0:
-        return 0.0
-    return float(np.dot(a, b) / (norm_a * norm_b))
-def embed_document(chunks: list[str]) -> np.ndarray:
-    """
-    Embed all chunks as ONE document so the context model sees surrounding
-    chunks.  Returns shape (n_chunks, 1024).
-    """
-    embeddings_list = ctx_model.encode([chunks])   # list of 1 numpy array
-    return embeddings_list[0]                      # (n_chunks, 1024)
-def embed_query(query: str) -> np.ndarray:
-    """Embed a single query string.  Returns shape (1024,)."""
-    # query model expects list[str] → returns list of 1-D arrays
-    result = query_model.encode([query])
-    return np.array(result[0]).flatten()
-def search(query: str, chunks: list[str], chunk_embeddings: np.ndarray, top_k: int = 5):
-    """Return top-k chunks ranked by cosine similarity to query."""
-    q_emb = embed_query(query)
-    scores = [cosine_similarity(q_emb, chunk_embeddings[i]) for i in range(len(chunks))]
     ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
     return [(chunks[idx], score) for idx, score in ranked[:top_k]]
-# ─────────────────────────── Sidebar ────────────────────────────────
 with st.sidebar:
     st.header("⚙️ Settings")
-    chunk_size = st.slider("Sentences per chunk", min_value=1, max_value=8, value=3)
-    overlap    = st.slider("Sentence overlap",    min_value=0, max_value=4, value=1)
-    top_k      = st.slider("Results to show",     min_value=1, max_value=10, value=5)
     st.markdown("---")
     st.markdown(
         "**How it works**\n\n"
-        "1. Your file is split into overlapping sentence chunks.\n"
-        "2. All chunks are embedded together as one document using the *context* model "
-        "so each chunk is aware of its neighbours.\n"
-        "3. Your question is embedded with the *query* model.\n"
         "4. Cosine similarity ranks chunks by relevance."
     )
-# ─────────────────────────── File upload ────────────────────────────
 uploaded = st.file_uploader("📄 Upload a document", type=["txt", "md"])
 if uploaded:
@@ -131,50 +110,38 @@ if uploaded:
     with st.expander("📃 Preview document", expanded=False):
         st.text(raw_text[:3000] + ("…" if len(raw_text) > 3000 else ""))
-    # Re-chunk & re-embed whenever the file or settings change
     cache_key = (uploaded.name, uploaded.size, chunk_size, overlap)
     if st.session_state.get("cache_key") != cache_key:
         with st.spinner("Chunking and embedding document…"):
             chunks = chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
             embeddings = embed_document(chunks)
-        st.session_state["cache_key"]   = cache_key
-        st.session_state["chunks"]      = chunks
-        st.session_state["embeddings"]  = embeddings
         st.success(f"✅ Indexed **{len(chunks)}** chunks from *{uploaded.name}*")
     else:
         chunks     = st.session_state["chunks"]
         embeddings = st.session_state["embeddings"]
         st.info(f"✅ Using cached index — **{len(chunks)}** chunks from *{uploaded.name}*")
-    # ─────────────────────────── Query ──────────────────────────────
     st.markdown("---")
     query = st.text_input("💬 Ask a question about the document", placeholder="e.g. What is the main conclusion?")
     if st.button("🔍 Search", disabled=not query.strip()):
-        if query.strip():
-            with st.spinner("Searching…"):
-                results = search(query, chunks, embeddings, top_k=top_k)
-            st.markdown("### 📌 Top Results")
-            for rank, (chunk_text_result, score) in enumerate(results, 1):
-                pct = score * 100
-                color = "#2ecc71" if pct >= 60 else "#f39c12" if pct >= 35 else "#e74c3c"
-                st.markdown(
-                    f"""
-                    <div style="
-                        border-left: 4px solid {color};
-                        padding: 12px 16px;
-                        margin-bottom: 12px;
-                        background: #f8f9fa;
-                        border-radius: 0 8px 8px 0;
-                    ">
-                        <div style="font-size:0.8rem;color:{color};font-weight:600;margin-bottom:6px;">
-                            #{rank} &nbsp;·&nbsp; Similarity: {pct:.1f}%
-                        </div>
-                        <div style="font-size:0.95rem;line-height:1.6;">{chunk_text_result}</div>
                     </div>
-                    """,
-                    unsafe_allow_html=True,
-                )
 else:
     st.info("👆 Upload a `.txt` or `.md` file to get started.")

 import os
+import sys
 import builtins
+# ── Self-relaunch guard ────────────────────────────────────────────────────────
+# HuggingFace Spaces (and some local setups) run `python app.py` instead of
+# `streamlit run app.py`.  Detect this and relaunch correctly.
+if not os.environ.get("STREAMLIT_SERVER_PORT"):
+    import subprocess
+    sys.exit(
+        subprocess.call([
+            sys.executable, "-m", "streamlit", "run", __file__,
+            "--server.address=0.0.0.0",
+            "--server.port=7860",
+            "--server.headless=true",
+            "--browser.gatherUsageStats=false",
+        ])
+    )
+# ── Auto-answer the transformers "custom code" prompt ─────────────────────────
 _real_input = builtins.input
 def _auto_yes(prompt=""):
+    if any(kw in str(prompt).lower() for kw in ("custom code", "trust", "wish to run")):
         return "y"
     return _real_input(prompt)
 builtins.input = _auto_yes
+os.environ["TRUST_REMOTE_CODE"]             = "1"
+os.environ["HF_HUB_DISABLE_PROGRESS_BARS"]  = "1"
+os.environ["TOKENIZERS_PARALLELISM"]        = "false"
+os.environ["HF_HUB_VERBOSITY"]             = "error"
+# ── Imports ────────────────────────────────────────────────────────────────────
 import streamlit as st
 import numpy as np
 import re
 from transformers import AutoModel
 try:
     import transformers.dynamic_module_utils as _dmu
+    _dmu.resolve_trust_remote_code = lambda *a, **kw: True
 except Exception:
     pass
+# ─────────────────────────── Page config ──────────────────────────────────────
+st.set_page_config(page_title="pplx-embed Semantic Search", page_icon="🔍", layout="wide")
 st.title("🔍 Semantic Search with pplx-embed-context-v1")
 st.caption(
     "Powered by [perplexity-ai/pplx-embed-context-v1-0.6B](https://huggingface.co/perplexity-ai/pplx-embed-context-v1-0.6b)."
 )
+# ─────────────────────────── Model loading ────────────────────────────────────
 @st.cache_resource(show_spinner="Loading embedding models — this takes ~30 s on first run…")
 def load_models():
+    ctx_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-context-v1-0.6B", trust_remote_code=True)
+    query_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-v1-0.6B", trust_remote_code=True)
     return ctx_model, query_model
 ctx_model, query_model = load_models()
+# ─────────────────────────── Helpers ──────────────────────────────────────────
+def chunk_text(text, chunk_size=3, overlap=1):
     sentences = re.split(r'(?<=[.!?])\s+', text.strip())
     sentences = [s.strip() for s in sentences if s.strip()]
+    chunks, i = [], 0
     while i < len(sentences):
+        chunks.append(" ".join(sentences[i : i + chunk_size]))
         i += max(1, chunk_size - overlap)
     return chunks
+def cosine_similarity(a, b):
+    na, nb = np.linalg.norm(a), np.linalg.norm(b)
+    return float(np.dot(a, b) / (na * nb)) if na and nb else 0.0
+def embed_document(chunks):
+    return ctx_model.encode([chunks])[0]
+def embed_query(query):
+    return np.array(query_model.encode([query])[0]).flatten()
+def search(query, chunks, embeddings, top_k=5):
+    q = embed_query(query)
+    scores = [cosine_similarity(q, embeddings[i]) for i in range(len(chunks))]
     ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
     return [(chunks[idx], score) for idx, score in ranked[:top_k]]
+# ─────────────────────────── Sidebar ──────────────────────────────────────────
 with st.sidebar:
     st.header("⚙️ Settings")
+    chunk_size = st.slider("Sentences per chunk", 1, 8, 3)
+    overlap    = st.slider("Sentence overlap",    0, 4, 1)
+    top_k      = st.slider("Results to show",     1, 10, 5)
     st.markdown("---")
     st.markdown(
         "**How it works**\n\n"
+        "1. File split into overlapping sentence chunks.\n"
+        "2. All chunks embedded together as one document (context-aware).\n"
+        "3. Your question embedded with the query model.\n"
         "4. Cosine similarity ranks chunks by relevance."
     )
+# ─────────────────────────── File upload ──────────────────────────────────────
 uploaded = st.file_uploader("📄 Upload a document", type=["txt", "md"])
 if uploaded:
     with st.expander("📃 Preview document", expanded=False):
         st.text(raw_text[:3000] + ("…" if len(raw_text) > 3000 else ""))
     cache_key = (uploaded.name, uploaded.size, chunk_size, overlap)
     if st.session_state.get("cache_key") != cache_key:
         with st.spinner("Chunking and embedding document…"):
             chunks = chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
             embeddings = embed_document(chunks)
+        st.session_state.update(cache_key=cache_key, chunks=chunks, embeddings=embeddings)
         st.success(f"✅ Indexed **{len(chunks)}** chunks from *{uploaded.name}*")
     else:
         chunks     = st.session_state["chunks"]
         embeddings = st.session_state["embeddings"]
         st.info(f"✅ Using cached index — **{len(chunks)}** chunks from *{uploaded.name}*")
     st.markdown("---")
     query = st.text_input("💬 Ask a question about the document", placeholder="e.g. What is the main conclusion?")
     if st.button("🔍 Search", disabled=not query.strip()):
+        with st.spinner("Searching…"):
+            results = search(query, chunks, embeddings, top_k=top_k)
+        st.markdown("### 📌 Top Results")
+        for rank, (chunk_text_result, score) in enumerate(results, 1):
+            pct   = score * 100
+            color = "#2ecc71" if pct >= 60 else "#f39c12" if pct >= 35 else "#e74c3c"
+            st.markdown(
+                f"""<div style="border-left:4px solid {color};padding:12px 16px;
+                    margin-bottom:12px;background:#f8f9fa;border-radius:0 8px 8px 0;">
+                    <div style="font-size:.8rem;color:{color};font-weight:600;margin-bottom:6px;">
+                        #{rank} &nbsp;·&nbsp; Similarity: {pct:.1f}%
                     </div>
+                    <div style="font-size:.95rem;line-height:1.6;">{chunk_text_result}</div>
+                </div>""",
+                unsafe_allow_html=True,
+            )
 else:
     st.info("👆 Upload a `.txt` or `.md` file to get started.")