EdwardConstantine committed on
Commit
3170857
Β·
verified Β·
1 Parent(s): 5bb3eca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +243 -314
app.py CHANGED
@@ -1,314 +1,243 @@
1
- import streamlit as st
2
- import os
3
- import uuid
4
- import base64
5
- import tempfile
6
- from pathlib import Path
7
- from io import BytesIO
8
-
9
- # Set up environment
10
- os.environ["EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
11
- os.environ["LLM_PROVIDER"] = "huggingface"
12
- os.environ["HUGGINGFACE_MODEL"] = "HuggingFaceH4/zephyr-7b-beta"
13
-
14
- # Import after setting env
15
- import fitz # PyMuPDF
16
- import pytesseract
17
- from PIL import Image
18
- from docx import Document
19
- import pandas as pd
20
- import sqlite3
21
- import numpy as np
22
- import faiss
23
- from sentence_transformers import SentenceTransformer
24
- from huggingface_hub import InferenceClient
25
-
26
- # ============== CONFIG ==============
27
- CHUNK_SIZE = 500
28
- CHUNK_OVERLAP = 50
29
- SUPPORTED_EXTENSIONS = [".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png", ".csv", ".db"]
30
-
31
- # ============== TEXT PROCESSING ==============
32
- def chunk_text(text: str) -> list[dict]:
33
- if not text or not text.strip():
34
- return []
35
-
36
- text = " ".join(text.strip().split())
37
- chunks = []
38
- start = 0
39
- chunk_index = 0
40
-
41
- while start < len(text):
42
- end = start + CHUNK_SIZE
43
- chunk_content = text[start:end]
44
-
45
- if end < len(text):
46
- last_period = chunk_content.rfind(". ")
47
- if last_period > CHUNK_SIZE * 0.5:
48
- chunk_content = chunk_content[:last_period + 1]
49
- end = start + last_period + 1
50
-
51
- chunks.append({"content": chunk_content.strip(), "chunk_index": chunk_index})
52
- chunk_index += 1
53
- start = end - CHUNK_OVERLAP
54
-
55
- if start >= len(text) - CHUNK_OVERLAP:
56
- break
57
-
58
- return chunks
59
-
60
- # ============== DOCUMENT PARSER ==============
61
- def parse_pdf(file_bytes) -> str:
62
- text_parts = []
63
- doc = fitz.open(stream=file_bytes, filetype="pdf")
64
- for page_num, page in enumerate(doc):
65
- page_text = page.get_text()
66
- if not page_text.strip():
67
- pix = page.get_pixmap()
68
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
69
- try:
70
- page_text = pytesseract.image_to_string(img)
71
- except:
72
- page_text = ""
73
- if page_text.strip():
74
- text_parts.append(f"[Page {page_num + 1}]\n{page_text}")
75
- doc.close()
76
- return "\n\n".join(text_parts)
77
-
78
- def parse_docx(file_bytes) -> str:
79
- doc = Document(BytesIO(file_bytes))
80
- paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
81
- return "\n\n".join(paragraphs)
82
-
83
- def parse_txt(file_bytes) -> str:
84
- return file_bytes.decode("utf-8")
85
-
86
- def parse_image(file_bytes) -> str:
87
- img = Image.open(BytesIO(file_bytes))
88
- try:
89
- text = pytesseract.image_to_string(img)
90
- except:
91
- text = "[OCR not available]"
92
- return text
93
-
94
- def parse_csv(file_bytes) -> str:
95
- df = pd.read_csv(BytesIO(file_bytes))
96
- lines = [f"Columns: {', '.join(df.columns.tolist())}", f"Total rows: {len(df)}", "\nData:"]
97
- for idx, row in df.iterrows():
98
- row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
99
- lines.append(row_text)
100
- return "\n".join(lines)
101
-
102
- def parse_document(file_bytes, filename) -> dict:
103
- ext = Path(filename).suffix.lower()
104
-
105
- if ext == ".pdf":
106
- text = parse_pdf(file_bytes)
107
- elif ext == ".docx":
108
- text = parse_docx(file_bytes)
109
- elif ext == ".txt":
110
- text = parse_txt(file_bytes)
111
- elif ext in [".jpg", ".jpeg", ".png"]:
112
- text = parse_image(file_bytes)
113
- elif ext == ".csv":
114
- text = parse_csv(file_bytes)
115
- else:
116
- text = ""
117
-
118
- chunks = chunk_text(text)
119
- for chunk in chunks:
120
- chunk["source"] = filename
121
- chunk["file_type"] = ext
122
-
123
- return {"text": text, "chunks": chunks, "metadata": {"filename": filename, "file_type": ext, "total_chunks": len(chunks)}}
124
-
125
- # ============== EMBEDDING SERVICE ==============
126
- @st.cache_resource
127
- def load_embedding_model():
128
- return SentenceTransformer("all-MiniLM-L6-v2")
129
-
130
- def embed_texts(texts: list[str]) -> np.ndarray:
131
- model = load_embedding_model()
132
- return model.encode(texts)
133
-
134
- # ============== VECTOR STORE ==============
135
- class SimpleVectorStore:
136
- def __init__(self):
137
- self.index = None
138
- self.documents = []
139
- self.dimension = 384 # all-MiniLM-L6-v2 dimension
140
-
141
- def add_documents(self, chunks: list[dict]):
142
- if not chunks:
143
- return 0
144
-
145
- texts = [c["content"] for c in chunks]
146
- embeddings = embed_texts(texts).astype("float32")
147
-
148
- if self.index is None:
149
- self.index = faiss.IndexFlatL2(self.dimension)
150
-
151
- self.index.add(embeddings)
152
- self.documents.extend(chunks)
153
- return len(chunks)
154
-
155
- def search(self, query: str, top_k: int = 5) -> list[dict]:
156
- if self.index is None or self.index.ntotal == 0:
157
- return []
158
-
159
- query_embedding = embed_texts([query]).astype("float32")
160
- distances, indices = self.index.search(query_embedding, top_k)
161
-
162
- results = []
163
- for i, idx in enumerate(indices[0]):
164
- if 0 <= idx < len(self.documents):
165
- doc = self.documents[idx].copy()
166
- doc["score"] = float(distances[0][i])
167
- results.append(doc)
168
- return results
169
-
170
- def clear(self):
171
- self.index = None
172
- self.documents = []
173
-
174
- # ============== LLM SERVICE ==============
175
- @st.cache_resource
176
- def get_llm_client():
177
- return InferenceClient(
178
- model="HuggingFaceH4/zephyr-7b-beta",
179
- token=os.getenv("HUGGINGFACE_API_KEY", st.secrets.get("HUGGINGFACE_API_KEY", ""))
180
- )
181
-
182
- def generate_answer(question: str, context: str) -> str:
183
- prompt = f"""You are a helpful assistant that answers questions based on the provided context.
184
-
185
- CONTEXT:
186
- {context}
187
-
188
- INSTRUCTIONS:
189
- - Answer the question based ONLY on the context provided above.
190
- - If the context doesn't contain enough information, say "I don't have enough information."
191
- - Be concise and direct.
192
-
193
- QUESTION: {question}
194
-
195
- ANSWER:"""
196
-
197
- try:
198
- client = get_llm_client()
199
- response = client.chat_completion(
200
- messages=[{"role": "user", "content": prompt}],
201
- max_tokens=512,
202
- temperature=0.7
203
- )
204
- return response.choices[0].message.content
205
- except Exception as e:
206
- return f"Error generating answer: {str(e)}"
207
-
208
- # ============== STREAMLIT APP ==============
209
- st.set_page_config(page_title="Smart RAG API", page_icon="πŸ”", layout="wide")
210
-
211
- st.title("πŸ” Smart RAG API")
212
- st.markdown("Upload documents and ask questions about them - Powered by LangChain & HuggingFace")
213
-
214
- # Initialize vector store in session state
215
- if "vector_store" not in st.session_state:
216
- st.session_state.vector_store = SimpleVectorStore()
217
-
218
- # Sidebar
219
- with st.sidebar:
220
- st.header("πŸ“Š Status")
221
- st.success("βœ… App Running")
222
- st.metric("Documents", len(st.session_state.vector_store.documents))
223
-
224
- st.divider()
225
-
226
- if st.button("πŸ—‘οΈ Clear All Documents"):
227
- st.session_state.vector_store.clear()
228
- st.success("Cleared!")
229
- st.rerun()
230
-
231
- st.divider()
232
- st.markdown("### ℹ️ Supported Files")
233
- st.markdown("πŸ“„ PDF, πŸ“ DOCX, TXT, πŸ–ΌοΈ JPG, PNG, πŸ“Š CSV")
234
-
235
- # Main content
236
- col1, col2 = st.columns(2)
237
-
238
- # Upload section
239
- with col1:
240
- st.header("πŸ“ Upload Document")
241
-
242
- uploaded_file = st.file_uploader(
243
- "Choose a file",
244
- type=["pdf", "docx", "txt", "jpg", "jpeg", "png", "csv"],
245
- help="Supported: PDF, DOCX, TXT, Images, CSV"
246
- )
247
-
248
- if uploaded_file:
249
- if st.button("πŸ“€ Upload & Process", type="primary"):
250
- with st.spinner("Processing document..."):
251
- try:
252
- file_bytes = uploaded_file.getvalue()
253
- parsed = parse_document(file_bytes, uploaded_file.name)
254
- chunks_added = st.session_state.vector_store.add_documents(parsed["chunks"])
255
- st.success(f"βœ… Added {chunks_added} chunks from {uploaded_file.name}")
256
- except Exception as e:
257
- st.error(f"Error: {str(e)}")
258
-
259
- # Query section
260
- with col2:
261
- st.header("πŸ’¬ Ask Questions")
262
-
263
- question = st.text_area("Your question:", placeholder="What is this document about?", height=100)
264
-
265
- with st.expander("πŸ“· Add Image for OCR (Optional)"):
266
- image_file = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"], key="img")
267
- if image_file:
268
- st.image(image_file, width=200)
269
-
270
- top_k = st.slider("Number of sources", 1, 10, 3)
271
-
272
- if st.button("πŸ” Search & Answer", type="primary"):
273
- if not question:
274
- st.warning("Please enter a question")
275
- elif len(st.session_state.vector_store.documents) == 0:
276
- st.warning("Please upload documents first")
277
- else:
278
- with st.spinner("Searching and generating answer..."):
279
- # Handle image OCR if provided
280
- image_text = ""
281
- if image_file:
282
- try:
283
- img_bytes = image_file.getvalue()
284
- image_text = parse_image(img_bytes)
285
- except:
286
- pass
287
-
288
- # Search
289
- search_query = f"{question} {image_text[:200]}" if image_text else question
290
- results = st.session_state.vector_store.search(search_query, top_k)
291
-
292
- if results:
293
- # Build context
294
- context = "\n\n".join([f"[Source: {r['source']}]\n{r['content']}" for r in results])
295
-
296
- # Generate answer
297
- answer = generate_answer(question, context)
298
-
299
- st.subheader("πŸ“ Answer")
300
- st.markdown(answer)
301
-
302
- if image_text:
303
- st.subheader("πŸ–ΌοΈ Text from Image")
304
- st.text(image_text[:500])
305
-
306
- st.subheader("πŸ“š Sources")
307
- for i, r in enumerate(results, 1):
308
- with st.expander(f"Source {i}: {r['source']}"):
309
- st.write(r["content"][:300] + "...")
310
- else:
311
- st.warning("No relevant documents found")
312
-
313
- st.divider()
314
- st.caption("Built with FastAPI, FAISS, LangChain, SentenceTransformers & HuggingFace | 100% Free")
 
1
+ import streamlit as st
2
+ import os
3
+ import pdfplumber
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ from docx import Document
7
+ import pandas as pd
8
+ import numpy as np
9
+ import faiss
10
+ from sentence_transformers import SentenceTransformer
11
+ from huggingface_hub import InferenceClient
12
+
13
# ============== CONFIG ==============
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# ============== TEXT PROCESSING ==============
def chunk_text(text: str) -> list[dict]:
    """Split *text* into overlapping chunks of roughly CHUNK_SIZE characters.

    Whitespace is collapsed first. When a chunk would end mid-sentence and a
    ". " boundary exists past the halfway point, the chunk is cut at that
    sentence end instead. Consecutive chunks overlap by CHUNK_OVERLAP chars.

    Returns a list of ``{"content": str, "chunk_index": int}`` dicts
    (empty for blank input).
    """
    if not text or not text.strip():
        return []

    # Normalize all runs of whitespace to single spaces.
    normalized = " ".join(text.split())
    total = len(normalized)

    pieces: list[dict] = []
    cursor = 0
    while cursor < total:
        cut = cursor + CHUNK_SIZE
        piece = normalized[cursor:cut]

        # Prefer to end on a sentence boundary if one sits in the back half.
        if cut < total:
            boundary = piece.rfind(". ")
            if boundary > CHUNK_SIZE * 0.5:
                piece = piece[: boundary + 1]
                cut = cursor + boundary + 1

        pieces.append({"content": piece.strip(), "chunk_index": len(pieces)})

        # Step back by the overlap; stop once the remaining tail is covered.
        cursor = cut - CHUNK_OVERLAP
        if cursor >= total - CHUNK_OVERLAP:
            break

    return pieces
45
+
46
+ # ============== DOCUMENT PARSERS ==============
47
def parse_pdf(file_bytes) -> str:
    """Extract text from a PDF, prefixing each non-empty page with its number.

    Pages with no extractable text (e.g. scanned images) are skipped.
    """
    pages: list[str] = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None on image-only pages.
            content = page.extract_text() or ""
            if content.strip():
                pages.append(f"[Page {page_number}]\n{content}")
    return "\n\n".join(pages)
55
+
56
def parse_docx(file_bytes) -> str:
    """Return the non-blank paragraphs of a .docx file, separated by blank lines."""
    document = Document(BytesIO(file_bytes))
    kept = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n\n".join(kept)
60
+
61
def parse_txt(file_bytes) -> str:
    """Decode a plain-text upload as UTF-8.

    Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    raising UnicodeDecodeError, so a Latin-1/Windows-1252 text file still
    yields usable (if slightly lossy) content rather than a hard failure.
    """
    return file_bytes.decode("utf-8", errors="replace")
63
+
64
def parse_image(file_bytes) -> str:
    """Placeholder for image uploads: OCR is disabled in the cloud deployment."""
    placeholder = "[Image uploaded - OCR not available in cloud version]"
    return placeholder
66
+
67
def parse_csv(file_bytes) -> str:
    """Render a CSV as plain text: column list, total row count, then up to 50 rows.

    Each data row is formatted as ``col: value | col: value | ...``.
    """
    frame = pd.read_csv(BytesIO(file_bytes))
    header = [
        f"Columns: {', '.join(frame.columns.tolist())}",
        f"Total rows: {len(frame)}",
        "\nData:",
    ]
    # Cap the dump at 50 rows so huge files don't flood the chunker.
    body = [
        " | ".join(f"{name}: {value}" for name, value in record.items())
        for _, record in frame.head(50).iterrows()
    ]
    return "\n".join(header + body)
74
+
75
def parse_document(file_bytes, filename) -> dict:
    """Dispatch raw upload bytes to the parser matching *filename*'s extension.

    Unknown extensions yield empty text (and therefore no chunks). Every
    chunk is tagged with its source filename and file type.

    Returns ``{"text": full_text, "chunks": [chunk_dict, ...]}``.
    """
    ext = filename.split(".")[-1].lower()

    # Extension -> parser dispatch; unknown types fall through to "".
    parser_by_ext = {
        "pdf": parse_pdf,
        "docx": parse_docx,
        "txt": parse_txt,
        "jpg": parse_image,
        "jpeg": parse_image,
        "png": parse_image,
        "csv": parse_csv,
    }
    parser = parser_by_ext.get(ext)
    text = parser(file_bytes) if parser else ""

    chunks = chunk_text(text)
    for piece in chunks:
        piece["source"] = filename
        piece["file_type"] = ext

    return {"text": text, "chunks": chunks}
97
+
98
+ # ============== EMBEDDING SERVICE ==============
99
@st.cache_resource
def load_embedding_model():
    """Load the sentence-transformer once and cache it for the app's lifetime."""
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
102
+
103
def embed_texts(texts: list[str]) -> np.ndarray:
    """Encode a batch of strings into dense vectors with the cached model."""
    return load_embedding_model().encode(texts)
106
+
107
+ # ============== VECTOR STORE ==============
108
class SimpleVectorStore:
    """In-memory FAISS L2 index paired with the chunk dicts it indexes.

    Index rows and ``self.documents`` entries stay aligned one-to-one,
    so a FAISS hit at row *i* maps straight back to ``documents[i]``.
    """

    def __init__(self):
        self.index = None       # faiss.IndexFlatL2, created lazily on first add
        self.documents = []     # chunk dicts, one per index row
        self.dimension = 384    # embedding size of all-MiniLM-L6-v2

    def add_documents(self, chunks: list[dict]):
        """Embed and index *chunks*; return the number added (0 for empty input)."""
        if not chunks:
            return 0

        vectors = embed_texts([chunk["content"] for chunk in chunks]).astype("float32")

        if self.index is None:
            self.index = faiss.IndexFlatL2(self.dimension)

        self.index.add(vectors)
        self.documents.extend(chunks)
        return len(chunks)

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to *top_k* stored chunks nearest to *query*.

        Each result is a copy of the chunk dict with an added ``score``
        (L2 distance, lower is closer). Empty list if nothing is indexed.
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        query_vec = embed_texts([query]).astype("float32")
        distances, indices = self.index.search(query_vec, top_k)

        hits = []
        for rank, row in enumerate(indices[0]):
            # FAISS pads missing results with -1; guard the lookup.
            if 0 <= row < len(self.documents):
                hit = self.documents[row].copy()
                hit["score"] = float(distances[0][rank])
                hits.append(hit)
        return hits

    def clear(self):
        """Drop the index and every stored chunk."""
        self.index = None
        self.documents = []
146
+
147
+ # ============== LLM SERVICE ==============
148
@st.cache_resource
def get_llm_client():
    """Build (and cache) the HuggingFace InferenceClient for the chat model.

    The API token is read from the HUGGINGFACE_API_KEY environment variable
    first, then from Streamlit secrets; if neither is set the client is
    created with an empty token (anonymous, rate-limited access).
    """
    token = os.getenv("HUGGINGFACE_API_KEY", "")
    if not token:
        try:
            token = st.secrets["HUGGINGFACE_API_KEY"]
        except Exception:
            # Bare `except:` would also trap SystemExit/KeyboardInterrupt;
            # Exception still covers a missing secrets file or absent key.
            token = ""
    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=token)
157
+
158
def generate_answer(question: str, context: str) -> str:
    """Ask the chat model to answer *question* grounded in *context*.

    Any failure (client setup, API call, or response access) is reported
    as an ``"Error: ..."`` string rather than raised.
    """
    prompt = f"""You are a helpful assistant. Answer based on the context below.

CONTEXT:
{context}

QUESTION: {question}

ANSWER:"""

    try:
        client = get_llm_client()
        completion = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=0.7,
        )
        return completion.choices[0].message.content
    except Exception as exc:
        return f"Error: {str(exc)}"
178
+
179
# ============== STREAMLIT APP ==============
# Top-level Streamlit script: page setup, session-scoped vector store,
# sidebar status, then two columns (upload | question answering).
st.set_page_config(page_title="Smart RAG API", page_icon="πŸ”", layout="wide")

st.title("πŸ” Smart RAG API")
st.markdown("Upload documents and ask questions - Powered by HuggingFace")

# One vector store per browser session; survives Streamlit reruns.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = SimpleVectorStore()

# Sidebar: status, a destructive "clear" action, and supported formats.
with st.sidebar:
    st.header("πŸ“Š Status")
    st.success("βœ… Running")
    st.metric("Documents", len(st.session_state.vector_store.documents))

    if st.button("πŸ—‘οΈ Clear All"):
        st.session_state.vector_store.clear()
        # Rerun immediately so the Documents metric reflects the empty store.
        st.rerun()

    st.divider()
    st.markdown("**Supported:** PDF, DOCX, TXT, CSV")

# Main columns
col1, col2 = st.columns(2)

# Left column: parse an uploaded file and add its chunks to the index.
with col1:
    st.header("πŸ“ Upload")
    uploaded_file = st.file_uploader("Choose file", type=["pdf", "docx", "txt", "csv"])

    if uploaded_file and st.button("πŸ“€ Process", type="primary"):
        with st.spinner("Processing..."):
            try:
                parsed = parse_document(uploaded_file.getvalue(), uploaded_file.name)
                added = st.session_state.vector_store.add_documents(parsed["chunks"])
                st.success(f"βœ… Added {added} chunks")
            except Exception as e:
                # Surface any parse/embed failure to the user instead of crashing.
                st.error(f"Error: {e}")

# Right column: retrieve top-k chunks for the question, then generate.
with col2:
    st.header("πŸ’¬ Ask")
    question = st.text_area("Question:", placeholder="What is this about?")
    top_k = st.slider("Sources", 1, 5, 3)

    if st.button("πŸ” Answer", type="primary"):
        if not question:
            st.warning("Enter a question")
        elif not st.session_state.vector_store.documents:
            st.warning("Upload documents first")
        else:
            with st.spinner("Thinking..."):
                results = st.session_state.vector_store.search(question, top_k)
                # NOTE(review): when search returns no results nothing is
                # rendered at all — consider an else branch with a warning.
                if results:
                    # Build the LLM context from the retrieved chunks.
                    context = "\n\n".join([f"[{r['source']}]: {r['content']}" for r in results])
                    answer = generate_answer(question, context)

                    st.subheader("πŸ“ Answer")
                    st.write(answer)

                    # Show each source chunk (truncated) for transparency.
                    st.subheader("πŸ“š Sources")
                    for r in results:
                        with st.expander(r["source"]):
                            st.write(r["content"][:300])

st.divider()
st.caption("Smart RAG API - FAISS + HuggingFace")