ChatBotsTA committed
Commit 1a83899 · verified · 1 Parent(s): ab1429e

Update app.py

Files changed (1)
  1. app.py +147 -55
app.py CHANGED
@@ -1,14 +1,28 @@
-import os, io, re, json, base64, requests, numpy as np
+import os
+import io
+import re
+import json
+import base64
+import requests
+import numpy as np
 import streamlit as st
 from pypdf import PdfReader
 import matplotlib.pyplot as plt
 
 # -----------------------------
-# Config
+# Config / Secrets (safe)
 # -----------------------------
 st.set_page_config(page_title="PDF Summarizer + Audio + QA", page_icon="📄", layout="wide")
 
-HF_TOKEN = os.environ.get("HF_TOKEN", st.secrets.get("HF_TOKEN", ""))
+# Prefer environment variable (Spaces sets secrets as env vars), *then* try st.secrets safely.
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+if not HF_TOKEN:
+    try:
+        # Access st.secrets inside try/except so we don't crash when no secrets file exists.
+        HF_TOKEN = st.secrets.get("HF_TOKEN", "") if hasattr(st, "secrets") else ""
+    except Exception:
+        HF_TOKEN = ""
+
 HEADERS_JSON = {
     "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
     "Content-Type": "application/json",
@@ -21,107 +35,182 @@ EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 QA_MODEL = "deepset/roberta-base-squad2"
 
 # -----------------------------
-# API helpers
+# Helper: Hugging Face inference
 # -----------------------------
-def hf_infer_json(model_id: str, payload: dict, router=False, accept=None):
+def hf_infer_json(model_id: str, payload: dict, router=False, accept=None, timeout=120):
+    """
+    Send request to Hugging Face Hosted Inference API.
+    If `router=True` we'll use the router base path (useful for some pipelines).
+    If backend returns binary (audio), this returns raw bytes.
+    """
     if router:
         url = f"https://router.huggingface.co/hf-inference/models/{model_id}"
     else:
         url = f"https://api-inference.huggingface.co/models/{model_id}"
+
     headers = HEADERS_JSON.copy()
     if accept:
         headers["Accept"] = accept
-    r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
-    r.raise_for_status()
+
+    try:
+        r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
+        r.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        # Bubble up a useful message
+        raise RuntimeError(f"Hugging Face request failed: {e}")
+
+    # Try to decode JSON; if fails, return bytes/content
     try:
         return r.json()
-    except requests.exceptions.JSONDecodeError:
+    except ValueError:
         return r.content
 
+# -----------------------------
+# Text / PDF utilities
+# -----------------------------
+def extract_text_from_pdf(file) -> str:
+    reader = PdfReader(file)
+    pages = []
+    for p in reader.pages:
+        try:
+            pages.append(p.extract_text() or "")
+        except Exception:
+            pages.append("")
+    return "\n".join(pages)
+
+def clean_text(s: str) -> str:
+    return re.sub(r"\s+", " ", s).strip()
+
 def split_into_chunks(text: str, max_chars: int = 1800, overlap: int = 200):
-    text = re.sub(r"\s+", " ", text).strip()
+    text = clean_text(text)
     chunks = []
     i = 0
     while i < len(text):
         chunk = text[i:i+max_chars]
         last_dot = chunk.rfind(". ")
         if last_dot > 400:
-            chunk = chunk[:last_dot+1]
+            chunk = chunk[: last_dot + 1]
             i += last_dot + 1 - overlap
         else:
            i += max_chars - overlap
         chunks.append(chunk.strip())
     return [c for c in chunks if c]
 
+# -----------------------------
+# Embeddings + similarity
+# -----------------------------
 def embed_texts(texts):
+    """
+    Calls the feature-extraction pipeline on the router endpoint.
+    Returns numpy array shape (n_texts, dim)
+    """
     url = f"https://router.huggingface.co/hf-inference/models/{EMB_MODEL}/pipeline/feature-extraction"
     headers = {
         "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
         "Content-Type": "application/json",
         "Accept": "application/json",
     }
-    r = requests.post(url, headers=headers, data=json.dumps({"inputs": texts}), timeout=120)
-    r.raise_for_status()
+    try:
+        r = requests.post(url, headers=headers, data=json.dumps({"inputs": texts}), timeout=120)
+        r.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Embedding request failed: {e}")
+
    arr = np.array(r.json(), dtype=np.float32)
+
+    # Cases:
+    # - arr.ndim == 1 -> single vector (dim,) -> reshape to (1,dim)
+    # - arr.ndim == 2 -> batch of vectors (n, dim) -> return as-is
+    # - arr.ndim == 3 -> model returned token-level vectors per item: mean-pool per item -> (n, dim)
+    if arr.ndim == 1:
+        return arr.reshape(1, -1)
     if arr.ndim == 2:
-        return arr.mean(axis=0, keepdims=True)
+        return arr
     if arr.ndim == 3:
-        pooled = [a.mean(axis=0) for a in arr]
-        return np.vstack(pooled)
-    return np.array(arr)
+        pooled = np.array([a.mean(axis=0) for a in arr])
+        return pooled
+    # Fallback
+    return arr.reshape(arr.shape[0], -1)
 
 def cosine_sim(a, b):
-    a = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
-    b = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
-    return a @ b.T
+    """
+    a: (m, d), b: (n, d) -> returns (m, n)
+    """
+    a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
+    b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
+    return a_n @ b_n.T
 
+# -----------------------------
+# Summarization
+# -----------------------------
 def summarize_long_text(text: str, per_chunk_max_len=220, final_max_len=250):
     chunks = split_into_chunks(text, max_chars=1800, overlap=200)
     mini_summaries = []
     for c in chunks:
-        out = hf_infer_json(
-            SUMMARIZER_MODEL,
-            {"inputs": c, "parameters": {"max_length": per_chunk_max_len, "min_length": 60, "do_sample": False}},
-            router=False
-        )
-        if isinstance(out, list) and len(out) and "summary_text" in out[0]:
+        try:
+            out = hf_infer_json(
+                SUMMARIZER_MODEL,
+                {"inputs": c, "parameters": {"max_length": per_chunk_max_len, "min_length": 60, "do_sample": False}},
+                router=False,
+            )
+        except Exception as e:
+            # if API fails, include the chunk (truncated) as fallback
+            mini_summaries.append(c[:1000])
+            continue
+
+        # Hosted inference often returns a list of dicts with 'summary_text'
+        if isinstance(out, list) and len(out) and isinstance(out[0], dict) and "summary_text" in out[0]:
             mini_summaries.append(out[0]["summary_text"])
+        elif isinstance(out, dict) and "summary_text" in out:
+            mini_summaries.append(out["summary_text"])
         else:
             mini_summaries.append(c[:1000])
+
     joined = " ".join(mini_summaries)
-    final = hf_infer_json(
-        SUMMARIZER_MODEL,
-        {"inputs": joined, "parameters": {"max_length": final_max_len, "min_length": 80, "do_sample": False}},
-        router=False
-    )
-    if isinstance(final, list) and len(final) and "summary_text" in final[0]:
+    try:
+        final = hf_infer_json(
+            SUMMARIZER_MODEL,
+            {"inputs": joined, "parameters": {"max_length": final_max_len, "min_length": 80, "do_sample": False}},
+            router=False,
+        )
+    except Exception:
+        return joined[:1200], chunks
+
+    if isinstance(final, list) and len(final) and isinstance(final[0], dict) and "summary_text" in final[0]:
         return final[0]["summary_text"], chunks
+    if isinstance(final, dict) and "summary_text" in final:
+        return final["summary_text"], chunks
+
     return joined[:1200], chunks
 
+# -----------------------------
+# TTS
+# -----------------------------
 def tts_wav_bytes(text: str) -> bytes:
-    res = hf_infer_json(TTS_MODEL, {"inputs": text}, router=False, accept="audio/wav")
+    try:
+        res = hf_infer_json(TTS_MODEL, {"inputs": text}, router=False, accept="audio/wav", timeout=180)
+    except Exception as e:
+        raise RuntimeError(f"TTS request failed: {e}")
+
     if isinstance(res, (bytes, bytearray)):
         return res
     if isinstance(res, dict) and "audio" in res:
         try:
             return base64.b64decode(res["audio"])
-        except:
+        except Exception:
             pass
     raise RuntimeError("TTS API did not return audio bytes.")
 
-def extract_text_from_pdf(file) -> str:
-    reader = PdfReader(file)
-    pages = []
-    for p in reader.pages:
-        try:
-            pages.append(p.extract_text() or "")
-        except:
-            pages.append("")
-    return "\n".join(pages)
-
+# -----------------------------
+# Visualization helper
+# -----------------------------
 def make_word_freq_chart(text: str, top_k=20):
     text = text.lower()
-    stop = set(("the a an and of to in is are for with on by as at this that from be was were it its it’s into or if not your you we they their our can may such more most other also than which".split()))
+    stop = set(
+        (
+            "the a an and of to in is are for with on by as at this that from be was were it its it's into or if not your you we they their our can may such more most other also than which".split()
+        )
+    )
     tokens = re.findall(r"[a-zA-Z]{3,}", text)
     freq = {}
     for t in tokens:
@@ -147,10 +236,11 @@ st.title("📄 PDF → Summary · 🔊 Audio · 📊 Chart · ❓ Q&A")
 st.caption("Powered by Hugging Face Hosted Inference API (free models).")
 
 if not HF_TOKEN:
-    st.warning("Set HF_TOKEN in environment or in your Space secrets to use the Hosted Inference API.")
+    st.warning("No HF_TOKEN found. Add HF_TOKEN in Space Settings → Secrets (recommended). The app will still run but HF API calls will fail without a token.")
 
 uploaded = st.file_uploader("Upload a PDF", type=["pdf"])
 
+# session state
 if "doc_text" not in st.session_state:
     st.session_state.doc_text = ""
     st.session_state.chunks = []
@@ -169,12 +259,15 @@ if uploaded:
     with c1:
         if st.button("📝 Summarize"):
             with st.spinner("Summarizing..."):
-                summary, chunks = summarize_long_text(st.session_state.doc_text)
-                st.session_state.summary = summary
-                st.session_state.chunks = chunks
-                st.success("Summary ready.")
-                st.write("#### Summary")
-                st.write(summary)
+                try:
+                    summary, chunks = summarize_long_text(st.session_state.doc_text)
+                    st.session_state.summary = summary
+                    st.session_state.chunks = chunks
+                    st.success("Summary ready.")
+                    st.write("#### Summary")
+                    st.write(summary)
+                except Exception as e:
+                    st.error(f"Summarization failed: {e}")
 
     with c2:
         if st.button("🔊 Generate Audio (summary)"):
@@ -200,11 +293,10 @@ if uploaded:
             st.session_state.chunks = split_into_chunks(st.session_state.doc_text)
         with st.spinner("Thinking..."):
            try:
+                # embed once/cache
                 if st.session_state.chunk_vecs is None:
-                    vecs = embed_texts(st.session_state.chunks)
-                    st.session_state.chunk_vecs = vecs
-                else:
-                    vecs = st.session_state.chunk_vecs
+                    st.session_state.chunk_vecs = embed_texts(st.session_state.chunks)
+                vecs = st.session_state.chunk_vecs
 
                 q_vec = embed_texts([question])
                 sims = cosine_sim(q_vec, vecs).flatten()
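The hunk ends right after the similarity scores are computed, so the remainder of the Q&A handler is not visible in this diff. As a rough sketch only, the top-ranked chunk could be passed to QA_MODEL through the same hf_infer_json helper; the payload shape follows the Inference API's question-answering task, and the variable names below (top_idx, best_chunk, qa_out) are illustrative, not taken from the commit:

    # Illustrative sketch; the actual continuation of app.py is not shown in this diff.
    top_idx = int(np.argmax(sims))                 # index of the best-matching chunk
    best_chunk = st.session_state.chunks[top_idx]  # retrieved context for the question
    qa_out = hf_infer_json(
        QA_MODEL,
        {"inputs": {"question": question, "context": best_chunk}},  # standard QA payload
        router=False,
    )
    # The hosted QA pipeline typically returns {"answer": ..., "score": ..., "start": ..., "end": ...}
    if isinstance(qa_out, dict) and "answer" in qa_out:
        st.write(f"**Answer:** {qa_out['answer']} (score: {qa_out['score']:.2f})")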