EdwardConstantine commited on
Commit
f86e562
Β·
verified Β·
1 Parent(s): 7ae61ba

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +314 -0
  2. requirements.txt +9 -3
app.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import uuid
4
+ import base64
5
+ import tempfile
6
+ from pathlib import Path
7
+ from io import BytesIO
8
+
9
+ # Set up environment
10
+ os.environ["EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
11
+ os.environ["LLM_PROVIDER"] = "huggingface"
12
+ os.environ["HUGGINGFACE_MODEL"] = "HuggingFaceH4/zephyr-7b-beta"
13
+
14
+ # Import after setting env
15
+ import fitz # PyMuPDF
16
+ import pytesseract
17
+ from PIL import Image
18
+ from docx import Document
19
+ import pandas as pd
20
+ import sqlite3
21
+ import numpy as np
22
+ import faiss
23
+ from sentence_transformers import SentenceTransformer
24
+ from huggingface_hub import InferenceClient
25
+
26
# ============== CONFIG ==============
CHUNK_SIZE = 500      # max characters per text chunk
CHUNK_OVERLAP = 50    # characters shared between consecutive chunks
# NOTE(review): ".db" is listed here but parse_document has no SQLite branch
# and the uploader widget does not accept it — confirm whether it belongs.
SUPPORTED_EXTENSIONS = [".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png", ".csv", ".db"]
30
+
31
+ # ============== TEXT PROCESSING ==============
32
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE if "CHUNK_SIZE" in globals() else 500,
               chunk_overlap: int = CHUNK_OVERLAP if "CHUNK_OVERLAP" in globals() else 50) -> list[dict]:
    """Split *text* into overlapping chunks, preferring sentence boundaries.

    Whitespace is normalized first.  Each chunk holds at most *chunk_size*
    characters; consecutive chunks share *chunk_overlap* characters.  When a
    ". " boundary falls in the second half of a chunk, the chunk is cut there
    so sentences are not split mid-way.

    Args:
        text: Raw document text; blank or empty input yields [].
        chunk_size: Maximum characters per chunk (defaults to CHUNK_SIZE).
        chunk_overlap: Overlap between consecutive chunks (CHUNK_OVERLAP).

    Returns:
        List of {"content": str, "chunk_index": int} dicts.

    Raises:
        ValueError: If chunk_overlap >= chunk_size (would loop forever).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    if not text or not text.strip():
        return []

    # Collapse every run of whitespace into a single space.
    text = " ".join(text.strip().split())
    chunks = []
    start = 0
    chunk_index = 0

    while start < len(text):
        end = start + chunk_size
        chunk_content = text[start:end]

        # Prefer a sentence boundary, but only past the halfway point so we
        # never emit a tiny fragment as a full chunk.
        if end < len(text):
            last_period = chunk_content.rfind(". ")
            if last_period > chunk_size * 0.5:
                chunk_content = chunk_content[:last_period + 1]
                end = start + last_period + 1

        chunks.append({"content": chunk_content.strip(), "chunk_index": chunk_index})
        chunk_index += 1
        start = end - chunk_overlap

        # Remaining text is already covered by the chunk we just emitted.
        if start >= len(text) - chunk_overlap:
            break

    return chunks
59
+
60
+ # ============== DOCUMENT PARSER ==============
61
def parse_pdf(file_bytes) -> str:
    """Extract text from a PDF given as raw bytes.

    Pages that carry a text layer are read directly via PyMuPDF; pages with
    no text (scanned images) are rendered to a bitmap and passed through
    OCR as a best-effort fallback.

    Returns:
        Page texts joined by blank lines, each prefixed with "[Page N]".
    """
    text_parts = []
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        for page_num, page in enumerate(doc):
            page_text = page.get_text()
            if not page_text.strip():
                # No text layer: rasterize the page and OCR it.
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                try:
                    page_text = pytesseract.image_to_string(img)
                except Exception:
                    # OCR is optional (tesseract binary may be absent);
                    # a bare except here would also swallow KeyboardInterrupt.
                    page_text = ""
            if page_text.strip():
                text_parts.append(f"[Page {page_num + 1}]\n{page_text}")
    finally:
        # Close the document even if a page raises, so the handle never leaks.
        doc.close()
    return "\n\n".join(text_parts)
77
+
78
def parse_docx(file_bytes) -> str:
    """Return the non-empty paragraphs of a DOCX file, blank-line separated."""
    document = Document(BytesIO(file_bytes))
    non_empty = (para.text for para in document.paragraphs if para.text.strip())
    return "\n\n".join(non_empty)
82
+
83
def parse_txt(file_bytes) -> str:
    """Decode a plain-text file given as raw bytes.

    Tries strict UTF-8 first; if the file contains invalid byte sequences
    (e.g. a Latin-1 upload), re-decodes with U+FFFD replacement characters
    instead of raising, so one stray byte cannot abort the whole upload.
    """
    try:
        return file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return file_bytes.decode("utf-8", errors="replace")
85
+
86
def parse_image(file_bytes) -> str:
    """Run OCR on an image given as raw bytes.

    Returns the recognized text, or the placeholder "[OCR not available]"
    when OCR fails (e.g. the tesseract binary is not installed).
    """
    img = Image.open(BytesIO(file_bytes))
    try:
        # Narrowed from a bare except: that also caught SystemExit and
        # KeyboardInterrupt, hiding real interpreter shutdowns.
        text = pytesseract.image_to_string(img)
    except Exception:
        text = "[OCR not available]"
    return text
93
+
94
def parse_csv(file_bytes) -> str:
    """Render a CSV file as readable text: column summary plus one line per row."""
    frame = pd.read_csv(BytesIO(file_bytes))
    header = f"Columns: {', '.join(frame.columns.tolist())}"
    count = f"Total rows: {len(frame)}"
    # One "col: val | col: val" line per data row.
    body = [
        " | ".join(f"{col}: {val}" for col, val in record.items())
        for _, record in frame.iterrows()
    ]
    return "\n".join([header, count, "\nData:", *body])
101
+
102
def parse_document(file_bytes, filename) -> dict:
    """Parse an uploaded file into text and indexed chunks by extension.

    Unknown extensions produce empty text (and therefore zero chunks).
    Every chunk is tagged with its source filename and file type.

    Returns:
        {"text": str, "chunks": list[dict], "metadata": dict}
    """
    ext = Path(filename).suffix.lower()

    # Extension -> parser dispatch; the three image formats share one parser.
    dispatch = {
        ".pdf": parse_pdf,
        ".docx": parse_docx,
        ".txt": parse_txt,
        ".jpg": parse_image,
        ".jpeg": parse_image,
        ".png": parse_image,
        ".csv": parse_csv,
    }
    parser = dispatch.get(ext)
    text = parser(file_bytes) if parser else ""

    chunks = chunk_text(text)
    for chunk in chunks:
        chunk["source"] = filename
        chunk["file_type"] = ext

    return {
        "text": text,
        "chunks": chunks,
        "metadata": {"filename": filename, "file_type": ext, "total_chunks": len(chunks)},
    }
124
+
125
+ # ============== EMBEDDING SERVICE ==============
126
@st.cache_resource
def load_embedding_model():
    """Load the sentence-embedding model once per Streamlit process.

    Reads the model name from the EMBEDDING_MODEL env var (set at the top
    of this file) instead of repeating the literal, so the two can no
    longer drift apart; falls back to all-MiniLM-L6-v2.
    """
    return SentenceTransformer(os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2"))
129
+
130
def embed_texts(texts: list[str]) -> np.ndarray:
    """Encode *texts* into embedding vectors using the cached model."""
    return load_embedding_model().encode(texts)
133
+
134
+ # ============== VECTOR STORE ==============
135
class SimpleVectorStore:
    """In-memory vector store: a flat L2 FAISS index plus a parallel chunk list.

    Chunk dicts are kept in `documents` at the same position as their
    embedding in the FAISS index, so a search hit's index maps straight
    back to its chunk.
    """

    def __init__(self):
        self.index = None        # faiss.IndexFlatL2, created lazily on first add
        self.documents = []      # chunk dicts, position-aligned with the index
        self.dimension = 384     # all-MiniLM-L6-v2 dimension

    def add_documents(self, chunks: list[dict]):
        """Embed *chunks* and append them to the index; return count added."""
        if not chunks:
            return 0

        vectors = embed_texts([chunk["content"] for chunk in chunks]).astype("float32")

        if self.index is None:
            self.index = faiss.IndexFlatL2(self.dimension)

        self.index.add(vectors)
        self.documents.extend(chunks)
        return len(chunks)

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to *top_k* chunks nearest to *query*.

        Each hit is a copy of the stored chunk with an added "score" key
        (L2 distance — lower means closer). Empty store yields [].
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        query_vec = embed_texts([query]).astype("float32")
        distances, indices = self.index.search(query_vec, top_k)

        hits = []
        for rank, position in enumerate(indices[0]):
            # FAISS pads missing results with -1, so guard the lookup.
            if 0 <= position < len(self.documents):
                hit = self.documents[position].copy()
                hit["score"] = float(distances[0][rank])
                hits.append(hit)
        return hits

    def clear(self):
        """Drop the index and every stored chunk."""
        self.index = None
        self.documents = []
173
+
174
+ # ============== LLM SERVICE ==============
175
@st.cache_resource
def get_llm_client():
    """Build the HuggingFace inference client once per Streamlit process.

    The API token comes from the HUGGINGFACE_API_KEY env var, falling back
    to Streamlit secrets.  The secrets lookup is attempted only when the
    env var is missing: the original passed `st.secrets.get(...)` as the
    eager default of `os.getenv`, which raises at import time on any
    deployment without a secrets.toml even when the env var is set.
    The model name reuses the HUGGINGFACE_MODEL env var set at file top.
    """
    token = os.getenv("HUGGINGFACE_API_KEY")
    if not token:
        try:
            token = st.secrets.get("HUGGINGFACE_API_KEY", "")
        except Exception:
            # No secrets.toml configured; proceed unauthenticated.
            token = ""
    return InferenceClient(
        model=os.environ.get("HUGGINGFACE_MODEL", "HuggingFaceH4/zephyr-7b-beta"),
        token=token,
    )
181
+
182
def generate_answer(question: str, context: str) -> str:
    """Ask the LLM to answer *question* using only the retrieved *context*.

    Returns the model's reply text, or an "Error generating answer: ..."
    string if the inference call fails (auth, network, model unavailable).
    """
    prompt = f"""You are a helpful assistant that answers questions based on the provided context.

CONTEXT:
{context}

INSTRUCTIONS:
- Answer the question based ONLY on the context provided above.
- If the context doesn't contain enough information, say "I don't have enough information."
- Be concise and direct.

QUESTION: {question}

ANSWER:"""

    try:
        reply = get_llm_client().chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=0.7,
        )
        return reply.choices[0].message.content
    except Exception as e:
        # Keep the UI alive: report the failure as the answer text.
        return f"Error generating answer: {str(e)}"
207
+
208
# ============== STREAMLIT APP ==============
st.set_page_config(page_title="Smart RAG API", page_icon="πŸ”", layout="wide")

st.title("πŸ” Smart RAG API")
# Subtitle previously credited LangChain, which this app never imports or uses.
st.markdown("Upload documents and ask questions about them - Powered by FAISS & HuggingFace")

# One vector store per browser session; survives Streamlit's script reruns.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = SimpleVectorStore()
217
+
218
# Sidebar: status readout, store reset, and supported-format hint.
with st.sidebar:
    st.header("πŸ“Š Status")
    st.success("βœ… App Running")
    # Counts indexed chunks, not distinct uploaded files.
    st.metric("Documents", len(st.session_state.vector_store.documents))

    st.divider()

    if st.button("πŸ—‘οΈ Clear All Documents"):
        st.session_state.vector_store.clear()
        st.success("Cleared!")
        # Rerun so the metric above immediately reflects the emptied store.
        st.rerun()

    st.divider()
    st.markdown("### ℹ️ Supported Files")
    st.markdown("πŸ“„ PDF, πŸ“ DOCX, TXT, πŸ–ΌοΈ JPG, PNG, πŸ“Š CSV")
234
+
235
# Main content: two columns — upload on the left, Q&A on the right.
col1, col2 = st.columns(2)

# Upload section
with col1:
    st.header("πŸ“ Upload Document")

    uploaded_file = st.file_uploader(
        "Choose a file",
        type=["pdf", "docx", "txt", "jpg", "jpeg", "png", "csv"],
        help="Supported: PDF, DOCX, TXT, Images, CSV"
    )

    if uploaded_file:
        if st.button("πŸ“€ Upload & Process", type="primary"):
            with st.spinner("Processing document..."):
                try:
                    # Parse -> chunk -> embed -> index in one pass.
                    file_bytes = uploaded_file.getvalue()
                    parsed = parse_document(file_bytes, uploaded_file.name)
                    chunks_added = st.session_state.vector_store.add_documents(parsed["chunks"])
                    st.success(f"βœ… Added {chunks_added} chunks from {uploaded_file.name}")
                except Exception as e:
                    # Surface parse/embed failures in the UI instead of crashing.
                    st.error(f"Error: {str(e)}")
258
+
259
# Query section: question input, optional OCR image, retrieval + answer.
with col2:
    st.header("πŸ’¬ Ask Questions")

    question = st.text_area("Your question:", placeholder="What is this document about?", height=100)

    with st.expander("πŸ“· Add Image for OCR (Optional)"):
        image_file = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"], key="img")
        if image_file:
            st.image(image_file, width=200)

    top_k = st.slider("Number of sources", 1, 10, 3)

    if st.button("πŸ” Search & Answer", type="primary"):
        if not question:
            st.warning("Please enter a question")
        elif len(st.session_state.vector_store.documents) == 0:
            st.warning("Please upload documents first")
        else:
            with st.spinner("Searching and generating answer..."):
                # Best-effort OCR of the optional image; its text augments
                # the retrieval query below.
                image_text = ""
                if image_file:
                    try:
                        img_bytes = image_file.getvalue()
                        image_text = parse_image(img_bytes)
                    except Exception:
                        # Narrowed from a bare except: that also swallowed
                        # KeyboardInterrupt/SystemExit. OCR stays optional.
                        pass

                # Retrieve the top_k nearest chunks for the (augmented) query.
                search_query = f"{question} {image_text[:200]}" if image_text else question
                results = st.session_state.vector_store.search(search_query, top_k)

                if results:
                    # Concatenate retrieved chunks, each labeled with its source.
                    context = "\n\n".join([f"[Source: {r['source']}]\n{r['content']}" for r in results])

                    # Ground the LLM answer in the retrieved context only.
                    answer = generate_answer(question, context)

                    st.subheader("πŸ“ Answer")
                    st.markdown(answer)

                    if image_text:
                        st.subheader("πŸ–ΌοΈ Text from Image")
                        st.text(image_text[:500])

                    # Show a truncated preview of each supporting chunk.
                    st.subheader("πŸ“š Sources")
                    for i, r in enumerate(results, 1):
                        with st.expander(f"Source {i}: {r['source']}"):
                            st.write(r["content"][:300] + "...")
                else:
                    st.warning("No relevant documents found")
312
+
313
st.divider()
# Caption previously credited FastAPI & LangChain; neither appears in this
# file's imports or in requirements.txt — list what is actually used.
st.caption("Built with Streamlit, FAISS, SentenceTransformers & HuggingFace | 100% Free")
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ streamlit==1.38.0
2
+ PyMuPDF==1.24.0
3
+ python-docx==1.1.0
4
+ pandas==2.2.0
5
+ pytesseract==0.3.10
6
+ Pillow==10.3.0
7
+ sentence-transformers==3.0.0
8
+ faiss-cpu==1.8.0
9
+ huggingface-hub==0.34.0