EdwardConstantine committed on
Commit
3154aad
·
verified ·
1 Parent(s): 6bc0b5d

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +198 -70
src/streamlit_app.py CHANGED
@@ -1,21 +1,28 @@
1
  import streamlit as st
2
  import os
 
3
  import pdfplumber
4
  from io import BytesIO
5
- from PIL import Image
6
  from docx import Document
7
  import pandas as pd
8
  import numpy as np
9
  import faiss
10
- from sentence_transformers import SentenceTransformer
11
  from huggingface_hub import InferenceClient
12
 
 
 
 
 
 
 
13
  # ============== CONFIG ==============
14
  CHUNK_SIZE = 500
15
  CHUNK_OVERLAP = 50
 
16
 
17
- # ============== TEXT PROCESSING ==============
18
- def chunk_text(text: str) -> list[dict]:
 
19
  if not text or not text.strip():
20
  return []
21
 
@@ -28,13 +35,17 @@ def chunk_text(text: str) -> list[dict]:
28
  end = start + CHUNK_SIZE
29
  chunk_content = text[start:end]
30
 
 
31
  if end < len(text):
32
  last_period = chunk_content.rfind(". ")
33
  if last_period > CHUNK_SIZE * 0.5:
34
  chunk_content = chunk_content[:last_period + 1]
35
  end = start + last_period + 1
36
 
37
- chunks.append({"content": chunk_content.strip(), "chunk_index": chunk_index})
 
 
 
38
  chunk_index += 1
39
  start = end - CHUNK_OVERLAP
40
 
@@ -44,7 +55,8 @@ def chunk_text(text: str) -> list[dict]:
44
  return chunks
45
 
46
  # ============== DOCUMENT PARSERS ==============
47
- def parse_pdf(file_bytes) -> str:
 
48
  text_parts = []
49
  with pdfplumber.open(BytesIO(file_bytes)) as pdf:
50
  for i, page in enumerate(pdf.pages):
@@ -53,26 +65,31 @@ def parse_pdf(file_bytes) -> str:
53
  text_parts.append(f"[Page {i + 1}]\n{page_text}")
54
  return "\n\n".join(text_parts)
55
 
56
- def parse_docx(file_bytes) -> str:
 
57
  doc = Document(BytesIO(file_bytes))
58
  paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
59
  return "\n\n".join(paragraphs)
60
 
61
- def parse_txt(file_bytes) -> str:
 
62
  return file_bytes.decode("utf-8")
63
 
64
- def parse_image(file_bytes) -> str:
65
- return "[Image uploaded - OCR not available in cloud version]"
66
-
67
- def parse_csv(file_bytes) -> str:
68
  df = pd.read_csv(BytesIO(file_bytes))
69
- lines = [f"Columns: {', '.join(df.columns.tolist())}", f"Total rows: {len(df)}", "\nData:"]
 
 
 
 
70
  for idx, row in df.head(50).iterrows():
71
  row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
72
  lines.append(row_text)
73
  return "\n".join(lines)
74
 
75
- def parse_document(file_bytes, filename) -> dict:
 
76
  ext = filename.split(".")[-1].lower()
77
 
78
  if ext == "pdf":
@@ -81,55 +98,76 @@ def parse_document(file_bytes, filename) -> dict:
81
  text = parse_docx(file_bytes)
82
  elif ext == "txt":
83
  text = parse_txt(file_bytes)
84
- elif ext in ["jpg", "jpeg", "png"]:
85
- text = parse_image(file_bytes)
86
  elif ext == "csv":
87
  text = parse_csv(file_bytes)
88
  else:
89
- text = ""
90
 
91
  chunks = chunk_text(text)
 
 
92
  for chunk in chunks:
93
  chunk["source"] = filename
94
  chunk["file_type"] = ext
95
 
96
  return {"text": text, "chunks": chunks}
97
 
98
- # ============== EMBEDDING SERVICE ==============
99
- @st.cache_resource
100
- def load_embedding_model():
101
- return SentenceTransformer("all-MiniLM-L6-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- def embed_texts(texts: list[str]) -> np.ndarray:
104
- model = load_embedding_model()
105
- return model.encode(texts)
106
 
107
- # ============== VECTOR STORE ==============
108
- class SimpleVectorStore:
 
 
109
  def __init__(self):
110
  self.index = None
111
  self.documents = []
112
- self.dimension = 384
113
 
114
- def add_documents(self, chunks: list[dict]):
 
115
  if not chunks:
116
  return 0
117
 
118
  texts = [c["content"] for c in chunks]
119
- embeddings = embed_texts(texts).astype("float32")
120
 
121
  if self.index is None:
122
- self.index = faiss.IndexFlatL2(self.dimension)
123
 
124
  self.index.add(embeddings)
125
  self.documents.extend(chunks)
126
  return len(chunks)
127
 
128
- def search(self, query: str, top_k: int = 5) -> list[dict]:
 
129
  if self.index is None or self.index.ntotal == 0:
130
  return []
131
 
132
- query_embedding = embed_texts([query]).astype("float32")
133
  distances, indices = self.index.search(query_embedding, top_k)
134
 
135
  results = []
@@ -141,19 +179,43 @@ class SimpleVectorStore:
141
  return results
142
 
143
  def clear(self):
 
144
  self.index = None
145
  self.documents = []
 
 
 
 
 
 
 
146
 
147
- # ============== LLM SERVICE ==============
148
- @st.cache_resource
149
  def get_llm_client():
150
- return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- def generate_answer(question: str, context: str) -> str:
153
- prompt = f"""You are a helpful assistant. Answer based on the context below.
154
  CONTEXT:
155
  {context}
 
 
 
 
 
 
 
156
  QUESTION: {question}
 
157
  ANSWER:"""
158
 
159
  try:
@@ -167,68 +229,134 @@ ANSWER:"""
167
  except Exception as e:
168
  return f"Error: {str(e)}"
169
 
170
- # ============== STREAMLIT APP ==============
171
- st.set_page_config(page_title="Smart RAG API", page_icon="πŸ”", layout="wide")
 
 
 
 
172
 
173
  st.title("πŸ” Smart RAG API")
174
- st.markdown("Upload documents and ask questions - Powered by HuggingFace")
 
 
 
 
175
 
 
176
  if "vector_store" not in st.session_state:
177
- st.session_state.vector_store = SimpleVectorStore()
178
 
179
  # Sidebar
180
  with st.sidebar:
181
  st.header("πŸ“Š Status")
 
182
  st.success("βœ… Running")
183
- st.metric("Documents", len(st.session_state.vector_store.documents))
 
 
 
184
 
185
- if st.button("πŸ—‘οΈ Clear All"):
186
  st.session_state.vector_store.clear()
 
187
  st.rerun()
188
 
189
  st.divider()
190
- st.markdown("**Supported:** PDF, DOCX, TXT, CSV")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # Main columns
193
  col1, col2 = st.columns(2)
194
 
 
195
  with col1:
196
- st.header("πŸ“ Upload")
197
- uploaded_file = st.file_uploader("Choose file", type=["pdf", "docx", "txt", "csv"])
198
-
199
- if uploaded_file and st.button("πŸ“€ Process", type="primary"):
200
- with st.spinner("Processing..."):
201
- try:
202
- parsed = parse_document(uploaded_file.getvalue(), uploaded_file.name)
203
- added = st.session_state.vector_store.add_documents(parsed["chunks"])
204
- st.success(f"βœ… Added {added} chunks")
205
- except Exception as e:
206
- st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
 
208
  with col2:
209
- st.header("πŸ’¬ Ask")
210
- question = st.text_area("Question:", placeholder="What is this about?")
211
- top_k = st.slider("Sources", 1, 5, 3)
212
 
213
- if st.button("πŸ” Answer", type="primary"):
 
 
 
 
 
 
 
 
214
  if not question:
215
- st.warning("Enter a question")
216
- elif not st.session_state.vector_store.documents:
217
- st.warning("Upload documents first")
218
  else:
219
- with st.spinner("Thinking..."):
 
220
  results = st.session_state.vector_store.search(question, top_k)
 
221
  if results:
222
- context = "\n\n".join([f"[{r['source']}]: {r['content']}" for r in results])
 
 
 
 
 
 
223
  answer = generate_answer(question, context)
224
 
 
225
  st.subheader("πŸ“ Answer")
226
- st.write(answer)
227
 
 
228
  st.subheader("πŸ“š Sources")
229
- for r in results:
230
- with st.expander(r["source"]):
231
- st.write(r["content"][:300])
 
 
232
 
 
233
  st.divider()
234
- st.caption("Smart RAG API - FAISS + HuggingFace")
 
1
  import streamlit as st
2
  import os
3
+ import re
4
  import pdfplumber
5
  from io import BytesIO
 
6
  from docx import Document
7
  import pandas as pd
8
  import numpy as np
9
  import faiss
 
10
  from huggingface_hub import InferenceClient
11
 
12
+ # ============================================
13
+ # SMART RAG API - HuggingFace Space Version
14
+ # Technologies: Streamlit, FAISS, HuggingFace Hub
15
+ # Parsers: pdfplumber, python-docx, pandas
16
+ # ============================================
17
+
18
  # ============== CONFIG ==============
19
  CHUNK_SIZE = 500
20
  CHUNK_OVERLAP = 50
21
+ EMBEDDING_DIM = 384
22
 
23
+ # ============== TEXT CHUNKING ==============
24
+ def chunk_text(text):
25
+ """Convert text into clean, meaningful chunks with overlap."""
26
  if not text or not text.strip():
27
  return []
28
 
 
35
  end = start + CHUNK_SIZE
36
  chunk_content = text[start:end]
37
 
38
+ # Try to break at sentence boundary
39
  if end < len(text):
40
  last_period = chunk_content.rfind(". ")
41
  if last_period > CHUNK_SIZE * 0.5:
42
  chunk_content = chunk_content[:last_period + 1]
43
  end = start + last_period + 1
44
 
45
+ chunks.append({
46
+ "content": chunk_content.strip(),
47
+ "chunk_index": chunk_index
48
+ })
49
  chunk_index += 1
50
  start = end - CHUNK_OVERLAP
51
 
 
55
  return chunks
56
 
57
  # ============== DOCUMENT PARSERS ==============
58
+ def parse_pdf(file_bytes):
59
+ """.pdf via pdfplumber"""
60
  text_parts = []
61
  with pdfplumber.open(BytesIO(file_bytes)) as pdf:
62
  for i, page in enumerate(pdf.pages):
 
65
  text_parts.append(f"[Page {i + 1}]\n{page_text}")
66
  return "\n\n".join(text_parts)
67
 
68
def parse_docx(file_bytes):
    """Extract the text of an in-memory .docx file via python-docx.

    Returns the text of every paragraph that is not blank, with the
    paragraphs separated by an empty line.
    """
    document = Document(BytesIO(file_bytes))
    non_blank = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            non_blank.append(paragraph.text)
    return "\n\n".join(non_blank)
73
 
74
def parse_txt(file_bytes):
    """Decode the raw bytes of a .txt upload into a string.

    The bytes are decoded as UTF-8. A leading byte-order mark (written by
    some editors) is dropped so it does not end up inside the first chunk,
    and bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    raising, so a file in a legacy encoding is still indexed rather than
    rejected outright.

    Args:
        file_bytes: The file content as ``bytes``.

    Returns:
        The decoded text.
    """
    # "utf-8-sig" behaves exactly like "utf-8" except that it strips a BOM.
    return file_bytes.decode("utf-8-sig", errors="replace")
77
 
78
def parse_csv(file_bytes):
    """Render an in-memory .csv file as plain text using pandas.

    The result starts with the column names and the total row count,
    followed by a "Data:" heading and the first 50 rows, one per line,
    each written as "column: value" pairs separated by " | ".
    """
    frame = pd.read_csv(BytesIO(file_bytes))
    column_line = f"Columns: {', '.join(frame.columns.tolist())}"
    count_line = f"Total rows: {len(frame)}"
    row_lines = [
        " | ".join(f"{col}: {val}" for col, val in record.items())
        for _, record in frame.head(50).iterrows()
    ]
    return "\n".join([column_line, count_line, "\nData:", *row_lines])
90
 
91
+ def parse_document(file_bytes, filename):
92
+ """Parse document and return chunks with metadata."""
93
  ext = filename.split(".")[-1].lower()
94
 
95
  if ext == "pdf":
 
98
  text = parse_docx(file_bytes)
99
  elif ext == "txt":
100
  text = parse_txt(file_bytes)
 
 
101
  elif ext == "csv":
102
  text = parse_csv(file_bytes)
103
  else:
104
+ text = f"[Unsupported file type: {ext}]"
105
 
106
  chunks = chunk_text(text)
107
+
108
+ # Add metadata (filename, chunk index)
109
  for chunk in chunks:
110
  chunk["source"] = filename
111
  chunk["file_type"] = ext
112
 
113
  return {"text": text, "chunks": chunks}
114
 
115
+ # ============== EMBEDDINGS (HuggingFace style) ==============
116
def simple_tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lower-cased and every run of the letters a-z that is
    bounded by word boundaries is returned, in order. Runs that touch a
    digit or an underscore are not bounded that way and are skipped.
    """
    return re.findall(r'\b[a-z]+\b', text.lower())
121
+
122
def hash_embed(text, dim=EMBEDDING_DIM):
    """Embed *text* as a normalized bag-of-words vector using the hashing trick.

    Each token from ``simple_tokenize`` is mapped to one of ``dim`` buckets
    and that bucket's count is incremented; the count vector is then scaled
    to unit L2 norm.

    The bucket is derived from a BLAKE2b digest of the token rather than the
    built-in ``hash()``: ``hash()`` of a ``str`` is salted per interpreter
    process, so the same text would embed differently on every run, which
    makes results impossible to reproduce or compare across restarts.

    Args:
        text: The text to embed.
        dim: Length of the returned vector.

    Returns:
        A 1-D NumPy array of length ``dim``. It is all zeros when the text
        yields no tokens.
    """
    import hashlib  # local import so the block does not rely on file-level imports

    vector = np.zeros(dim)

    for token in simple_tokenize(text):
        digest = hashlib.blake2b(token.encode("utf-8"), digest_size=8).digest()
        vector[int.from_bytes(digest, "big") % dim] += 1

    # Normalize; skip the division for an all-zero vector to avoid NaNs.
    norm = np.linalg.norm(vector)
    if norm > 0:
        vector = vector / norm

    return vector
137
 
138
def embed_texts(texts):
    """Embed every text with ``hash_embed`` and return the results as one float32 array."""
    vectors = []
    for item in texts:
        vectors.append(hash_embed(item))
    stacked = np.array(vectors)
    return stacked.astype("float32")
141
 
142
+ # ============== VECTOR STORE (FAISS) ==============
143
+ class VectorStore:
144
+ """Store embeddings in FAISS for similarity search."""
145
+
146
  def __init__(self):
147
  self.index = None
148
  self.documents = []
 
149
 
150
+ def add_documents(self, chunks):
151
+ """Add document chunks to FAISS index."""
152
  if not chunks:
153
  return 0
154
 
155
  texts = [c["content"] for c in chunks]
156
+ embeddings = embed_texts(texts)
157
 
158
  if self.index is None:
159
+ self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
160
 
161
  self.index.add(embeddings)
162
  self.documents.extend(chunks)
163
  return len(chunks)
164
 
165
+ def search(self, query, top_k=5):
166
+ """Perform similarity search."""
167
  if self.index is None or self.index.ntotal == 0:
168
  return []
169
 
170
+ query_embedding = embed_texts([query])
171
  distances, indices = self.index.search(query_embedding, top_k)
172
 
173
  results = []
 
179
  return results
180
 
181
  def clear(self):
182
+ """Clear all documents."""
183
  self.index = None
184
  self.documents = []
185
+
186
+ def get_stats(self):
187
+ """Get store statistics."""
188
+ return {
189
+ "total_documents": len(self.documents),
190
+ "index_size": self.index.ntotal if self.index else 0
191
+ }
192
 
193
+ # ============== LLM SERVICE (HuggingFace Hub) ==============
 
194
def get_llm_client():
    """Build the HuggingFace ``InferenceClient`` used to generate answers.

    The API token is read from the ``HUGGINGFACE_API_KEY`` environment
    variable; if that is unset or empty, the Streamlit secret of the same
    name is tried. When neither provides a value the client is created with
    ``token=None``.

    Returns:
        An ``InferenceClient`` for the "HuggingFaceH4/zephyr-7b-beta" model.
    """
    token = os.getenv("HUGGINGFACE_API_KEY", "")
    if not token:
        try:
            token = st.secrets["HUGGINGFACE_API_KEY"]
        except Exception:
            # Best-effort lookup: a missing secrets file or key must not stop
            # the app. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt and SystemExit still propagate.
            token = ""
    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=token or None)
203
+
204
+ def generate_answer(question, context):
205
+ """Send prompt to LLM and return answer."""
206
+ prompt = f"""You are a helpful assistant that answers questions based on the provided context.
207
 
 
 
208
  CONTEXT:
209
  {context}
210
+
211
+ INSTRUCTIONS:
212
+ - Answer the question based ONLY on the context provided above.
213
+ - If the context doesn't contain enough information, say so.
214
+ - Be concise and direct.
215
+ - Mention which source the information comes from if relevant.
216
+
217
  QUESTION: {question}
218
+
219
  ANSWER:"""
220
 
221
  try:
 
229
  except Exception as e:
230
  return f"Error: {str(e)}"
231
 
232
+ # ============== STREAMLIT UI ==============
233
+ st.set_page_config(
234
+ page_title="Smart RAG API",
235
+ page_icon="πŸ”",
236
+ layout="wide"
237
+ )
238
 
239
  st.title("πŸ” Smart RAG API")
240
+ st.markdown("""
241
+ **Retrieval-Augmented Generation API** - Upload documents and ask questions!
242
+
243
+ **Technologies:** FastAPI β€’ FAISS β€’ pdfplumber β€’ python-docx β€’ pandas β€’ HuggingFace Hub
244
+ """)
245
 
246
+ # Initialize vector store
247
  if "vector_store" not in st.session_state:
248
+ st.session_state.vector_store = VectorStore()
249
 
250
  # Sidebar
251
  with st.sidebar:
252
  st.header("πŸ“Š Status")
253
+ stats = st.session_state.vector_store.get_stats()
254
  st.success("βœ… Running")
255
+ st.metric("Documents in Store", stats["total_documents"])
256
+ st.metric("Index Size", stats["index_size"])
257
+
258
+ st.divider()
259
 
260
+ if st.button("πŸ—‘οΈ Clear All Documents"):
261
  st.session_state.vector_store.clear()
262
+ st.success("Cleared!")
263
  st.rerun()
264
 
265
  st.divider()
266
+ st.markdown("### πŸ“ Supported Files")
267
+ st.markdown("""
268
+ - πŸ“• **PDF** (pdfplumber)
269
+ - πŸ“ **DOCX** (python-docx)
270
+ - πŸ“„ **TXT** (direct)
271
+ - πŸ“Š **CSV** (pandas)
272
+ """)
273
+
274
+ st.divider()
275
+ st.markdown("### πŸ› οΈ Tech Stack")
276
+ st.markdown("""
277
+ - **Vector Store:** FAISS
278
+ - **LLM:** HuggingFace Hub
279
+ - **Embeddings:** Custom (lightweight)
280
+ - **UI:** Streamlit
281
+ """)
282
 
283
+ # Main layout
284
  col1, col2 = st.columns(2)
285
 
286
+ # Upload Section
287
  with col1:
288
+ st.header("πŸ“€ Upload Document")
289
+
290
+ uploaded_file = st.file_uploader(
291
+ "Choose a file",
292
+ type=["pdf", "docx", "txt", "csv"],
293
+ help="Supported: PDF, DOCX, TXT, CSV"
294
+ )
295
+
296
+ if uploaded_file:
297
+ file_icon = {"pdf": "πŸ“•", "docx": "πŸ“", "txt": "πŸ“„", "csv": "πŸ“Š"}
298
+ ext = uploaded_file.name.split(".")[-1].lower()
299
+ st.info(f"{file_icon.get(ext, 'πŸ“')} **{uploaded_file.name}** ({uploaded_file.size} bytes)")
300
+
301
+ if st.button("πŸ“€ Process Document", type="primary"):
302
+ with st.spinner("Processing document..."):
303
+ try:
304
+ file_bytes = uploaded_file.getvalue()
305
+ parsed = parse_document(file_bytes, uploaded_file.name)
306
+ added = st.session_state.vector_store.add_documents(parsed["chunks"])
307
+ st.success(f"βœ… Success! Added **{added} chunks** to knowledge base.")
308
+ st.json({
309
+ "filename": uploaded_file.name,
310
+ "file_type": ext,
311
+ "chunks_created": added
312
+ })
313
+ except Exception as e:
314
+ st.error(f"❌ Error: {str(e)}")
315
 
316
+ # Query Section
317
  with col2:
318
+ st.header("πŸ’¬ Ask Questions")
 
 
319
 
320
+ question = st.text_area(
321
+ "Your question:",
322
+ placeholder="What is this document about?",
323
+ height=100
324
+ )
325
+
326
+ top_k = st.slider("Number of sources to retrieve", 1, 10, 3)
327
+
328
+ if st.button("πŸ” Search & Answer", type="primary"):
329
  if not question:
330
+ st.warning("⚠️ Please enter a question")
331
+ elif st.session_state.vector_store.get_stats()["total_documents"] == 0:
332
+ st.warning("⚠️ Please upload documents first")
333
  else:
334
+ with st.spinner("Searching and generating answer..."):
335
+ # Vector search
336
  results = st.session_state.vector_store.search(question, top_k)
337
+
338
  if results:
339
+ # Build context
340
+ context_parts = []
341
+ for i, r in enumerate(results, 1):
342
+ context_parts.append(f"[Source {i}: {r['source']}]\n{r['content']}")
343
+ context = "\n\n".join(context_parts)
344
+
345
+ # Generate answer via LLM
346
  answer = generate_answer(question, context)
347
 
348
+ # Display answer
349
  st.subheader("πŸ“ Answer")
350
+ st.markdown(answer)
351
 
352
+ # Display sources
353
  st.subheader("πŸ“š Sources")
354
+ for i, r in enumerate(results, 1):
355
+ with st.expander(f"Source {i}: {r['source']} (score: {r['score']:.3f})"):
356
+ st.write(r["content"][:500] + "..." if len(r["content"]) > 500 else r["content"])
357
+ else:
358
+ st.warning("No relevant documents found.")
359
 
360
+ # Footer
361
  st.divider()
362
+ st.caption("πŸš€ **Smart RAG API** | Built with FAISS, HuggingFace Hub, pdfplumber, python-docx, pandas | By Emon Karmoker")