kamp0010 committed on
Commit
5a2fe7a
Β·
verified Β·
1 Parent(s): 5379d0c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -101
app.py CHANGED
@@ -1,39 +1,49 @@
1
  import os
 
2
  import builtins
3
 
4
- # ── MUST happen before ANY other import ───────────────────────────────────────
5
- # transformers calls builtins.input() for the "Do you wish to run the custom
6
- # code? [y/N]" prompt. Patch it to always answer "y" silently.
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  _real_input = builtins.input
8
  def _auto_yes(prompt=""):
9
- if "custom code" in str(prompt).lower() or "trust" in str(prompt).lower():
10
  return "y"
11
  return _real_input(prompt)
12
  builtins.input = _auto_yes
13
 
14
- os.environ["TRUST_REMOTE_CODE"] = "1"
15
- os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
16
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
- os.environ["HF_HUB_VERBOSITY"] = "error"
18
 
 
19
  import streamlit as st
20
  import numpy as np
21
  import re
22
  from transformers import AutoModel
23
 
24
- # Belt-and-suspenders: patch the internal resolver too, after import
25
  try:
26
  import transformers.dynamic_module_utils as _dmu
27
- _dmu.resolve_trust_remote_code = lambda *a, **kw: True # type: ignore
28
  except Exception:
29
  pass
30
 
31
- # ─────────────────────────── Page config ────────────────────────────
32
- st.set_page_config(
33
- page_title="pplx-embed Semantic Search",
34
- page_icon="πŸ”",
35
- layout="wide",
36
- )
37
 
38
  st.title("πŸ” Semantic Search with pplx-embed-context-v1")
39
  st.caption(
@@ -41,88 +51,57 @@ st.caption(
41
  "Powered by [perplexity-ai/pplx-embed-context-v1-0.6B](https://huggingface.co/perplexity-ai/pplx-embed-context-v1-0.6b)."
42
  )
43
 
44
- # ─────────────────────────── Model loading ──────────────────────────
45
  @st.cache_resource(show_spinner="Loading embedding models β€” this takes ~30 s on first run…")
46
  def load_models():
47
- ctx_model = AutoModel.from_pretrained(
48
- "perplexity-ai/pplx-embed-context-v1-0.6B",
49
- trust_remote_code=True,
50
- )
51
- query_model = AutoModel.from_pretrained(
52
- "perplexity-ai/pplx-embed-v1-0.6B",
53
- trust_remote_code=True,
54
- )
55
  return ctx_model, query_model
56
 
57
  ctx_model, query_model = load_models()
58
 
59
- # ─────────────────────────── Helpers ────────────────────────────────
60
- def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
61
- """Split text into sentence-based chunks with overlap."""
62
- # Split into sentences (rough split on . ! ? followed by whitespace)
63
  sentences = re.split(r'(?<=[.!?])\s+', text.strip())
64
  sentences = [s.strip() for s in sentences if s.strip()]
65
-
66
- chunks = []
67
- i = 0
68
  while i < len(sentences):
69
- chunk = " ".join(sentences[i : i + chunk_size])
70
- chunks.append(chunk)
71
  i += max(1, chunk_size - overlap)
72
-
73
  return chunks
74
 
 
 
 
75
 
76
- def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
77
- """Cosine similarity between two 1-D vectors."""
78
- norm_a = np.linalg.norm(a)
79
- norm_b = np.linalg.norm(b)
80
- if norm_a == 0 or norm_b == 0:
81
- return 0.0
82
- return float(np.dot(a, b) / (norm_a * norm_b))
83
-
84
-
85
- def embed_document(chunks: list[str]) -> np.ndarray:
86
- """
87
- Embed all chunks as ONE document so the context model sees surrounding
88
- chunks. Returns shape (n_chunks, 1024).
89
- """
90
- embeddings_list = ctx_model.encode([chunks]) # list of 1 numpy array
91
- return embeddings_list[0] # (n_chunks, 1024)
92
 
 
 
93
 
94
- def embed_query(query: str) -> np.ndarray:
95
- """Embed a single query string. Returns shape (1024,)."""
96
- # query model expects list[str] β†’ returns list of 1-D arrays
97
- result = query_model.encode([query])
98
- return np.array(result[0]).flatten()
99
-
100
-
101
- def search(query: str, chunks: list[str], chunk_embeddings: np.ndarray, top_k: int = 5):
102
- """Return top-k chunks ranked by cosine similarity to query."""
103
- q_emb = embed_query(query)
104
- scores = [cosine_similarity(q_emb, chunk_embeddings[i]) for i in range(len(chunks))]
105
  ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
106
  return [(chunks[idx], score) for idx, score in ranked[:top_k]]
107
 
108
-
109
- # ─────────────────────────── Sidebar ────────────────────────────────
110
  with st.sidebar:
111
  st.header("βš™οΈ Settings")
112
- chunk_size = st.slider("Sentences per chunk", min_value=1, max_value=8, value=3)
113
- overlap = st.slider("Sentence overlap", min_value=0, max_value=4, value=1)
114
- top_k = st.slider("Results to show", min_value=1, max_value=10, value=5)
115
  st.markdown("---")
116
  st.markdown(
117
  "**How it works**\n\n"
118
- "1. Your file is split into overlapping sentence chunks.\n"
119
- "2. All chunks are embedded together as one document using the *context* model "
120
- "so each chunk is aware of its neighbours.\n"
121
- "3. Your question is embedded with the *query* model.\n"
122
  "4. Cosine similarity ranks chunks by relevance."
123
  )
124
 
125
- # ─────────────────────────── File upload ────────────────────────────
126
  uploaded = st.file_uploader("πŸ“„ Upload a document", type=["txt", "md"])
127
 
128
  if uploaded:
@@ -131,50 +110,38 @@ if uploaded:
131
  with st.expander("πŸ“ƒ Preview document", expanded=False):
132
  st.text(raw_text[:3000] + ("…" if len(raw_text) > 3000 else ""))
133
 
134
- # Re-chunk & re-embed whenever the file or settings change
135
  cache_key = (uploaded.name, uploaded.size, chunk_size, overlap)
136
  if st.session_state.get("cache_key") != cache_key:
137
  with st.spinner("Chunking and embedding document…"):
138
  chunks = chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
139
  embeddings = embed_document(chunks)
140
- st.session_state["cache_key"] = cache_key
141
- st.session_state["chunks"] = chunks
142
- st.session_state["embeddings"] = embeddings
143
  st.success(f"βœ… Indexed **{len(chunks)}** chunks from *{uploaded.name}*")
144
  else:
145
  chunks = st.session_state["chunks"]
146
  embeddings = st.session_state["embeddings"]
147
  st.info(f"βœ… Using cached index β€” **{len(chunks)}** chunks from *{uploaded.name}*")
148
 
149
- # ─────────────────────────── Query ──────────────────────────────
150
  st.markdown("---")
151
  query = st.text_input("πŸ’¬ Ask a question about the document", placeholder="e.g. What is the main conclusion?")
152
 
153
  if st.button("πŸ” Search", disabled=not query.strip()):
154
- if query.strip():
155
- with st.spinner("Searching…"):
156
- results = search(query, chunks, embeddings, top_k=top_k)
157
-
158
- st.markdown("### πŸ“Œ Top Results")
159
- for rank, (chunk_text_result, score) in enumerate(results, 1):
160
- pct = score * 100
161
- color = "#2ecc71" if pct >= 60 else "#f39c12" if pct >= 35 else "#e74c3c"
162
- st.markdown(
163
- f"""
164
- <div style="
165
- border-left: 4px solid {color};
166
- padding: 12px 16px;
167
- margin-bottom: 12px;
168
- background: #f8f9fa;
169
- border-radius: 0 8px 8px 0;
170
- ">
171
- <div style="font-size:0.8rem;color:{color};font-weight:600;margin-bottom:6px;">
172
- #{rank} &nbsp;Β·&nbsp; Similarity: {pct:.1f}%
173
- </div>
174
- <div style="font-size:0.95rem;line-height:1.6;">{chunk_text_result}</div>
175
  </div>
176
- """,
177
- unsafe_allow_html=True,
178
- )
 
179
  else:
180
  st.info("πŸ‘† Upload a `.txt` or `.md` file to get started.")
 
1
  import os
2
+ import sys
3
  import builtins
4
 
# ── Self-relaunch guard ────────────────────────────────────────────────────────
# HuggingFace Spaces (and some local setups) run `python app.py` instead of
# `streamlit run app.py`.  Detect this and relaunch under Streamlit.
if not os.environ.get("STREAMLIT_SERVER_PORT"):
    import subprocess
    # Export the sentinel BEFORE spawning: subprocess.call inherits os.environ,
    # so the relaunched child sees it and skips this guard.  Streamlit reads
    # STREAMLIT_* variables as configuration but does not set them itself, so
    # without this the guard would re-trigger inside the child and fork
    # another Streamlit process indefinitely.
    os.environ["STREAMLIT_SERVER_PORT"] = "7860"
    # Re-exec under `streamlit run` and propagate its exit code.
    sys.exit(
        subprocess.call([
            sys.executable, "-m", "streamlit", "run", __file__,
            "--server.address=0.0.0.0",        # reachable from outside the container
            "--server.port=7860",              # HF Spaces' expected port
            "--server.headless=true",          # don't try to open a browser
            "--browser.gatherUsageStats=false",
        ])
    )
19
+
20
+ # ── Auto-answer the transformers "custom code" prompt ─────────────────────────
21
  _real_input = builtins.input
22
  def _auto_yes(prompt=""):
23
+ if any(kw in str(prompt).lower() for kw in ("custom code", "trust", "wish to run")):
24
  return "y"
25
  return _real_input(prompt)
26
  builtins.input = _auto_yes
27
 
# Environment knobs, set before transformers / huggingface_hub are imported:
os.environ["TRUST_REMOTE_CODE"] = "1"             # opt in to remote model code globally
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"  # no tqdm bars in the app logs
os.environ["TOKENIZERS_PARALLELISM"] = "false"    # silence the tokenizers fork warning
os.environ["HF_HUB_VERBOSITY"] = "error"          # only surface hub errors
32
 
33
+ # ── Imports ────────────────────────────────────────────────────────────────────
34
  import streamlit as st
35
  import numpy as np
36
  import re
37
  from transformers import AutoModel
38
 
 
# Belt-and-suspenders: force transformers' internal trust-remote-code resolver
# to always return True so model loading never falls back to an interactive prompt.
try:
    import transformers.dynamic_module_utils as _dmu
    _dmu.resolve_trust_remote_code = lambda *a, **kw: True
except Exception:
    # Module layout may differ between transformers versions; failure here is
    # non-fatal because trust_remote_code=True is also passed at load time.
    pass
44
 
45
# ─────────────────────────── Page config ──────────────────────────────────────
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="pplx-embed Semantic Search", page_icon="πŸ”", layout="wide")

st.title("πŸ” Semantic Search with pplx-embed-context-v1")
49
  st.caption(
 
51
  "Powered by [perplexity-ai/pplx-embed-context-v1-0.6B](https://huggingface.co/perplexity-ai/pplx-embed-context-v1-0.6b)."
52
  )
53
 
54
# ─────────────────────────── Model loading ────────────────────────────────────
@st.cache_resource(show_spinner="Loading embedding models β€” this takes ~30 s on first run…")
def load_models():
    """Load the context and query embedding models (cached once per process)."""
    repo_ids = (
        "perplexity-ai/pplx-embed-context-v1-0.6B",
        "perplexity-ai/pplx-embed-v1-0.6B",
    )
    ctx_model, query_model = (
        AutoModel.from_pretrained(repo, trust_remote_code=True) for repo in repo_ids
    )
    return ctx_model, query_model
60
 
61
  ctx_model, query_model = load_models()
62
 
63
# ─────────────────────────── Helpers ──────────────────────────────────────────
def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
    """Split *text* into overlapping chunks of whole sentences.

    Sentences are found with a rough regex split on ``.``, ``!`` or ``?``
    followed by whitespace.  Each chunk joins up to ``chunk_size`` consecutive
    sentences, and successive chunks share ``overlap`` sentences.  Returns an
    empty list for blank input.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    sentences = [s.strip() for s in sentences if s.strip()]
    chunks, i = [], 0
    while i < len(sentences):
        chunks.append(" ".join(sentences[i : i + chunk_size]))
        # max(1, …) guarantees forward progress even when overlap >= chunk_size
        i += max(1, chunk_size - overlap)
    return chunks
72
 
73
def cosine_similarity(a, b):
    """Cosine similarity of two 1-D vectors; 0.0 when either has zero norm."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)
76
 
77
def embed_document(chunks):
    """Embed every chunk as ONE document so the context model sees neighbours.

    encode() takes a list of documents (each a list of chunk strings); we pass
    a single document and return its per-chunk embedding matrix.
    """
    per_document = ctx_model.encode([chunks])
    return per_document[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
def embed_query(query):
    """Embed one query string and return it as a flat 1-D numpy array."""
    encoded = query_model.encode([query])
    return np.array(encoded[0]).flatten()
82
 
83
def search(query, chunks, embeddings, top_k=5):
    """Return the top_k (chunk, score) pairs, highest cosine similarity first."""
    query_vec = embed_query(query)
    scored = []
    for idx, chunk in enumerate(chunks):
        scored.append((chunk, cosine_similarity(query_vec, embeddings[idx])))
    # stable sort keeps earlier chunks first on ties, matching enumerate order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_k]
88
 
89
# ─────────────────────────── Sidebar ──────────────────────────────────────────
# Chunking knobs; chunk_size and overlap feed the cache_key below, so changing
# either re-indexes the uploaded document.
with st.sidebar:
    st.header("βš™οΈ Settings")
    # st.slider positional args: (label, min, max, default)
    chunk_size = st.slider("Sentences per chunk", 1, 8, 3)
    overlap = st.slider("Sentence overlap", 0, 4, 1)
    top_k = st.slider("Results to show", 1, 10, 5)
    st.markdown("---")
    st.markdown(
        "**How it works**\n\n"
        "1. File split into overlapping sentence chunks.\n"
        "2. All chunks embedded together as one document (context-aware).\n"
        "3. Your question embedded with the query model.\n"
        "4. Cosine similarity ranks chunks by relevance."
    )
103
 
104
# ─────────────────────────── File upload ──────────────────────────────────────
# Plain-text formats only; returns None until the user uploads something.
uploaded = st.file_uploader("πŸ“„ Upload a document", type=["txt", "md"])
106
 
107
  if uploaded:
 
110
  with st.expander("πŸ“ƒ Preview document", expanded=False):
111
  st.text(raw_text[:3000] + ("…" if len(raw_text) > 3000 else ""))
112
 
 
113
    # Re-chunk & re-embed only when the file or chunking settings change;
    # (name, size, chunk_size, overlap) acts as the session cache key.
    cache_key = (uploaded.name, uploaded.size, chunk_size, overlap)
    if st.session_state.get("cache_key") != cache_key:
        with st.spinner("Chunking and embedding document…"):
            chunks = chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
            embeddings = embed_document(chunks)
        st.session_state.update(cache_key=cache_key, chunks=chunks, embeddings=embeddings)
        st.success(f"βœ… Indexed **{len(chunks)}** chunks from *{uploaded.name}*")
    else:
        # Same file and settings as last run: reuse the stored index.
        chunks = st.session_state["chunks"]
        embeddings = st.session_state["embeddings"]
        st.info(f"βœ… Using cached index β€” **{len(chunks)}** chunks from *{uploaded.name}*")
124
 
 
125
    st.markdown("---")
    query = st.text_input("πŸ’¬ Ask a question about the document", placeholder="e.g. What is the main conclusion?")

    # Button stays disabled until the query is non-blank.
    if st.button("πŸ” Search", disabled=not query.strip()):
        with st.spinner("Searching…"):
            results = search(query, chunks, embeddings, top_k=top_k)

        st.markdown("### πŸ“Œ Top Results")
        for rank, (chunk_text_result, score) in enumerate(results, 1):
            pct = score * 100
            # Traffic-light colour by similarity: green >= 60, amber >= 35, red below.
            color = "#2ecc71" if pct >= 60 else "#f39c12" if pct >= 35 else "#e74c3c"
            st.markdown(
                f"""<div style="border-left:4px solid {color};padding:12px 16px;
                margin-bottom:12px;background:#f8f9fa;border-radius:0 8px 8px 0;">
                <div style="font-size:.8rem;color:{color};font-weight:600;margin-bottom:6px;">
                #{rank} &nbsp;Β·&nbsp; Similarity: {pct:.1f}%
                </div>
                <div style="font-size:.95rem;line-height:1.6;">{chunk_text_result}</div>
                </div>""",
                unsafe_allow_html=True,
            )
146
  else:
147
  st.info("πŸ‘† Upload a `.txt` or `.md` file to get started.")