EdwardConstantine committed on
Commit
2811a96
·
verified ·
1 Parent(s): 9805983

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +238 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,240 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ import pdfplumber
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ from docx import Document
7
+ import pandas as pd
8
+ import numpy as np
9
+ import faiss
10
+ from sentence_transformers import SentenceTransformer
11
+ from huggingface_hub import InferenceClient
12
+
13
# ============== CONFIG ==============
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50


# ============== TEXT PROCESSING ==============
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> list[dict]:
    """Split *text* into overlapping, whitespace-normalised chunks.

    All runs of whitespace are collapsed to single spaces first. Each chunk
    is at most ``chunk_size`` characters; when more text follows and a
    sentence end (``". "``) is found past the midpoint of the window, the
    chunk is cut just after that period. Consecutive chunks share
    ``overlap`` characters.

    Args:
        text: Raw text to split. Empty or whitespace-only text yields ``[]``.
        chunk_size: Maximum chunk length in characters (must be positive).
        overlap: Characters shared between neighbouring chunks
            (``0 <= overlap < chunk_size``).

    Returns:
        A list of ``{"content": str, "chunk_index": int}`` dicts in
        document order.

    Raises:
        ValueError: If ``chunk_size`` / ``overlap`` are out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not text or not text.strip():
        return []

    text = " ".join(text.split())
    total = len(text)
    chunks = []
    start = 0

    while True:
        end = start + chunk_size
        piece = text[start:end]

        if end < total:
            # Prefer ending on a sentence boundary, but only if that keeps
            # more than half of the window.
            last_period = piece.rfind(". ")
            if last_period > chunk_size * 0.5:
                piece = piece[:last_period + 1]
                end = start + last_period + 1

        chunks.append({"content": piece.strip(), "chunk_index": len(chunks)})

        if end >= total:
            break
        # Step back by the overlap, but always move forward by at least one
        # character so a large overlap cannot stall the loop.
        start = max(end - overlap, start + 1)

    return chunks
45
+
46
+ # ============== DOCUMENT PARSERS ==============
47
def parse_pdf(file_bytes) -> str:
    """Extract text from a PDF given as raw bytes.

    Pages whose extracted text is empty or whitespace-only are skipped.
    Every kept page is prefixed with a ``[Page N]`` marker (1-based) and
    pages are separated by a blank line.
    """
    sections = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            extracted = page.extract_text() or ""
            if not extracted.strip():
                continue
            sections.append(f"[Page {page_number}]\n{extracted}")
    return "\n\n".join(sections)
55
+
56
def parse_docx(file_bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, blank-line separated."""
    document = Document(BytesIO(file_bytes))
    kept = []
    for para in document.paragraphs:
        # Skip paragraphs that contain nothing but whitespace.
        if para.text.strip():
            kept.append(para.text)
    return "\n\n".join(kept)
60
+
61
def parse_txt(file_bytes) -> str:
    """Decode an uploaded plain-text file to ``str``.

    The bytes are read as UTF-8. A leading byte-order mark is dropped
    (``utf-8-sig``) and undecodable bytes are replaced with U+FFFD, so a
    file in another encoding is decoded lossily rather than raising
    ``UnicodeDecodeError``.
    """
    return file_bytes.decode("utf-8-sig", errors="replace")
63
+
64
def parse_image(file_bytes) -> str:
    """Return a fixed placeholder for image uploads.

    ``file_bytes`` is accepted so the signature matches the other parsers,
    but it is not inspected.
    """
    placeholder = "[Image uploaded - OCR not available in cloud version]"
    return placeholder
66
+
67
def parse_csv(file_bytes) -> str:
    """Render a CSV file as plain text for indexing.

    The output starts with the column names and total row count, followed
    by up to the first 50 rows, each written as ``col: value`` pairs joined
    by `` | ``.
    """
    frame = pd.read_csv(BytesIO(file_bytes))
    header = ", ".join(frame.columns.tolist())
    rendered = [f"Columns: {header}", f"Total rows: {len(frame)}", "\nData:"]
    # Only the first 50 rows are rendered.
    rendered.extend(
        " | ".join(f"{col}: {val}" for col, val in row.items())
        for _, row in frame.head(50).iterrows()
    )
    return "\n".join(rendered)
74
+
75
def parse_document(file_bytes, filename) -> dict:
    """Parse an uploaded file and split its text into tagged chunks.

    The parser is chosen from the text after the last ``.`` in *filename*
    (lower-cased). Unrecognised extensions produce empty text and therefore
    no chunks.

    Returns:
        ``{"text": <full extracted text>, "chunks": <list of chunk dicts>}``
        where every chunk also carries ``source`` (the filename) and
        ``file_type`` (the extension).
    """
    ext = filename.split(".")[-1].lower()

    if ext in ("jpg", "jpeg", "png"):
        parser = parse_image
    else:
        parser = {
            "pdf": parse_pdf,
            "docx": parse_docx,
            "txt": parse_txt,
            "csv": parse_csv,
        }.get(ext)

    text = parser(file_bytes) if parser is not None else ""

    chunks = chunk_text(text)
    for chunk in chunks:
        chunk.update(source=filename, file_type=ext)

    return {"text": text, "chunks": chunks}
97
+
98
+ # ============== EMBEDDING SERVICE ==============
99
@st.cache_resource
def load_embedding_model():
    """Build the sentence-embedding model used for chunks and queries.

    Wrapped in ``st.cache_resource`` so the model object is reused instead
    of being rebuilt on every call.
    """
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformer(model_name)
102
+
103
def embed_texts(texts: list[str]) -> np.ndarray:
    """Encode *texts* with the shared embedding model and return the vectors."""
    encoder = load_embedding_model()
    vectors = encoder.encode(texts)
    return vectors
106
+
107
+ # ============== VECTOR STORE ==============
108
class SimpleVectorStore:
    """In-memory FAISS L2 index paired with the chunk dicts it was built from.

    ``documents[i]`` is the chunk whose embedding sits at row ``i`` of
    ``index``; both are only ever appended to together, or reset together.
    """

    def __init__(self):
        self.index = None      # created lazily on the first add_documents()
        self.documents = []    # chunk dicts, parallel to the index rows
        self.dimension = 384   # placeholder; replaced by the real embedding width

    def add_documents(self, chunks: list[dict]):
        """Embed and index *chunks*; return how many were added.

        Each chunk must have a ``"content"`` key holding the text to embed.
        An empty list is a no-op that returns 0.
        """
        if not chunks:
            return 0

        texts = [c["content"] for c in chunks]
        embeddings = np.ascontiguousarray(embed_texts(texts), dtype="float32")

        if self.index is None:
            # Size the index from the embeddings actually produced rather
            # than a hard-coded width, so a different model cannot mismatch.
            self.dimension = int(embeddings.shape[1])
            self.index = faiss.IndexFlatL2(self.dimension)

        self.index.add(embeddings)
        self.documents.extend(chunks)
        return len(chunks)

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to *top_k* stored chunks nearest to *query*.

        Each result is a copy of the stored chunk dict with an added
        ``"score"`` holding the distance reported by the index (smaller is
        closer). Returns ``[]`` when the store is empty or ``top_k`` is not
        positive.
        """
        if self.index is None or self.index.ntotal == 0 or top_k <= 0:
            return []

        # Never ask for more neighbours than there are vectors.
        k = min(top_k, self.index.ntotal)
        query_embedding = np.ascontiguousarray(embed_texts([query]), dtype="float32")
        distances, indices = self.index.search(query_embedding, k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            # Ignore any index that does not map to a stored chunk.
            if 0 <= idx < len(self.documents):
                doc = self.documents[idx].copy()
                doc["score"] = float(distance)
                results.append(doc)
        return results

    def clear(self):
        """Drop the index and all stored chunks."""
        self.index = None
        self.documents = []
146
+
147
+ # ============== LLM SERVICE ==============
148
@st.cache_resource
def get_llm_client():
    """Create the Hugging Face inference client used for answer generation.

    The API token is read from the ``HUGGINGFACE_API_KEY`` environment
    variable, falling back to the Streamlit secret of the same name. When
    neither is set the client is created without an explicit token.
    """
    token = os.getenv("HUGGINGFACE_API_KEY", "")
    if not token:
        try:
            token = st.secrets["HUGGINGFACE_API_KEY"]
        except Exception:
            # A missing secret is not an error here. `except Exception`
            # (not a bare `except:`) lets KeyboardInterrupt and SystemExit
            # propagate.
            token = ""
    # Pass None rather than "" so the client applies its own default token
    # handling instead of being handed an empty credential.
    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=token or None)
157
+
158
def generate_answer(question: str, context: str) -> str:
    """Ask the LLM to answer *question* using *context*.

    Returns the model's reply text. Any exception raised while obtaining
    the client or calling it is caught and returned as an ``"Error: ..."``
    string instead of propagating.
    """
    prompt = f"""You are a helpful assistant. Answer based on the context below.
CONTEXT:
{context}
QUESTION: {question}
ANSWER:"""
    messages = [{"role": "user", "content": prompt}]

    try:
        completion = get_llm_client().chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
175
+
176
+ # ============== STREAMLIT APP ==============
177
st.set_page_config(page_title="Smart RAG API", page_icon="🔍", layout="wide")

st.title("🔍 Smart RAG API")
st.markdown("Upload documents and ask questions - Powered by HuggingFace")

# One vector store per browser session; it survives Streamlit reruns.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = SimpleVectorStore()

# Sidebar
with st.sidebar:
    st.header("📊 Status")
    st.success("✅ Running")
    # Placeholder only: the count is filled in at the end of the script so
    # it reflects chunks added during this run.
    documents_metric = st.empty()

    if st.button("🗑️ Clear All"):
        st.session_state.vector_store.clear()
        st.rerun()

    st.divider()
    st.markdown("**Supported:** PDF, DOCX, TXT, CSV")

# Main columns
col1, col2 = st.columns(2)

with col1:
    st.header("📁 Upload")
    uploaded_file = st.file_uploader("Choose file", type=["pdf", "docx", "txt", "csv"])

    if uploaded_file and st.button("📤 Process", type="primary"):
        with st.spinner("Processing..."):
            try:
                parsed = parse_document(uploaded_file.getvalue(), uploaded_file.name)
                added = st.session_state.vector_store.add_documents(parsed["chunks"])
                st.success(f"✅ Added {added} chunks")
            except Exception as e:
                # Surface parser/indexing failures in the UI instead of
                # crashing the app.
                st.error(f"Error: {e}")

with col2:
    st.header("💬 Ask")
    question = st.text_area("Question:", placeholder="What is this about?")
    top_k = st.slider("Sources", 1, 5, 3)

    if st.button("🔍 Answer", type="primary"):
        if not question:
            st.warning("Enter a question")
        elif not st.session_state.vector_store.documents:
            st.warning("Upload documents first")
        else:
            with st.spinner("Thinking..."):
                results = st.session_state.vector_store.search(question, top_k)
                if results:
                    context = "\n\n".join([f"[{r['source']}]: {r['content']}" for r in results])
                    answer = generate_answer(question, context)

                    st.subheader("📝 Answer")
                    st.write(answer)

                    st.subheader("📚 Sources")
                    for r in results:
                        with st.expander(r["source"]):
                            # Show only the first 300 characters of each source.
                            st.write(r["content"][:300])

# Rendered last so the sidebar shows the post-upload count. The value is
# the number of stored chunks, not the number of uploaded files.
documents_metric.metric("Documents", len(st.session_state.vector_store.documents))

st.divider()
st.caption("Smart RAG API - FAISS + HuggingFace")