Spaces:

EdwardConstantine
/

smart-rag-api

Sleeping

App Files Files Community

EdwardConstantine committed 15 days ago

Commit

3154aad

verified ·

1 Parent(s): 6bc0b5d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +198 -70

src/streamlit_app.py CHANGED Viewed

@@ -1,21 +1,28 @@
 import streamlit as st
 import os
 import pdfplumber
 from io import BytesIO
-from PIL import Image
 from docx import Document
 import pandas as pd
 import numpy as np
 import faiss
-from sentence_transformers import SentenceTransformer
 from huggingface_hub import InferenceClient
 # ============== CONFIG ==============
 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 50
-# ============== TEXT PROCESSING ==============
-def chunk_text(text: str) -> list[dict]:
     if not text or not text.strip():
         return []
@@ -28,13 +35,17 @@ def chunk_text(text: str) -> list[dict]:
         end = start + CHUNK_SIZE
         chunk_content = text[start:end]
         if end < len(text):
             last_period = chunk_content.rfind(". ")
             if last_period > CHUNK_SIZE * 0.5:
                 chunk_content = chunk_content[:last_period + 1]
                 end = start + last_period + 1
-        chunks.append({"content": chunk_content.strip(), "chunk_index": chunk_index})
         chunk_index += 1
         start = end - CHUNK_OVERLAP
@@ -44,7 +55,8 @@ def chunk_text(text: str) -> list[dict]:
     return chunks
 # ============== DOCUMENT PARSERS ==============
-def parse_pdf(file_bytes) -> str:
     text_parts = []
     with pdfplumber.open(BytesIO(file_bytes)) as pdf:
         for i, page in enumerate(pdf.pages):
@@ -53,26 +65,31 @@ def parse_pdf(file_bytes) -> str:
                 text_parts.append(f"[Page {i + 1}]\n{page_text}")
     return "\n\n".join(text_parts)
-def parse_docx(file_bytes) -> str:
     doc = Document(BytesIO(file_bytes))
     paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
     return "\n\n".join(paragraphs)
-def parse_txt(file_bytes) -> str:
     return file_bytes.decode("utf-8")
-def parse_image(file_bytes) -> str:
-    return "[Image uploaded - OCR not available in cloud version]"
-def parse_csv(file_bytes) -> str:
     df = pd.read_csv(BytesIO(file_bytes))
-    lines = [f"Columns: {', '.join(df.columns.tolist())}", f"Total rows: {len(df)}", "\nData:"]
     for idx, row in df.head(50).iterrows():
         row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])
         lines.append(row_text)
     return "\n".join(lines)
-def parse_document(file_bytes, filename) -> dict:
     ext = filename.split(".")[-1].lower()
     if ext == "pdf":
@@ -81,55 +98,76 @@ def parse_document(file_bytes, filename) -> dict:
         text = parse_docx(file_bytes)
     elif ext == "txt":
         text = parse_txt(file_bytes)
-    elif ext in ["jpg", "jpeg", "png"]:
-        text = parse_image(file_bytes)
     elif ext == "csv":
         text = parse_csv(file_bytes)
     else:
-        text = ""
     chunks = chunk_text(text)
     for chunk in chunks:
         chunk["source"] = filename
         chunk["file_type"] = ext
     return {"text": text, "chunks": chunks}
-# ============== EMBEDDING SERVICE ==============
-@st.cache_resource
-def load_embedding_model():
-    return SentenceTransformer("all-MiniLM-L6-v2")
-def embed_texts(texts: list[str]) -> np.ndarray:
-    model = load_embedding_model()
-    return model.encode(texts)
-# ============== VECTOR STORE ==============
-class SimpleVectorStore:
     def __init__(self):
         self.index = None
         self.documents = []
-        self.dimension = 384
-    def add_documents(self, chunks: list[dict]):
         if not chunks:
             return 0
         texts = [c["content"] for c in chunks]
-        embeddings = embed_texts(texts).astype("float32")
         if self.index is None:
-            self.index = faiss.IndexFlatL2(self.dimension)
         self.index.add(embeddings)
         self.documents.extend(chunks)
         return len(chunks)
-    def search(self, query: str, top_k: int = 5) -> list[dict]:
         if self.index is None or self.index.ntotal == 0:
             return []
-        query_embedding = embed_texts([query]).astype("float32")
         distances, indices = self.index.search(query_embedding, top_k)
         results = []
@@ -141,19 +179,43 @@ class SimpleVectorStore:
         return results
     def clear(self):
         self.index = None
         self.documents = []
-# ============== LLM SERVICE ==============
-@st.cache_resource
 def get_llm_client():
-    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
-def generate_answer(question: str, context: str) -> str:
-    prompt = f"""You are a helpful assistant. Answer based on the context below.
 CONTEXT:
 {context}
 QUESTION: {question}
 ANSWER:"""
     try:
@@ -167,68 +229,134 @@ ANSWER:"""
     except Exception as e:
         return f"Error: {str(e)}"
-# ============== STREAMLIT APP ==============
-st.set_page_config(page_title="Smart RAG API", page_icon="🔍", layout="wide")
 st.title("🔍 Smart RAG API")
-st.markdown("Upload documents and ask questions - Powered by HuggingFace")
 if "vector_store" not in st.session_state:
-    st.session_state.vector_store = SimpleVectorStore()
 # Sidebar
 with st.sidebar:
     st.header("📊 Status")
     st.success("✅ Running")
-    st.metric("Documents", len(st.session_state.vector_store.documents))
-    if st.button("🗑️ Clear All"):
         st.session_state.vector_store.clear()
         st.rerun()
     st.divider()
-    st.markdown("**Supported:** PDF, DOCX, TXT, CSV")
-# Main columns
 col1, col2 = st.columns(2)
 with col1:
-    st.header("📁 Upload")
-    uploaded_file = st.file_uploader("Choose file", type=["pdf", "docx", "txt", "csv"])
-    if uploaded_file and st.button("📤 Process", type="primary"):
-        with st.spinner("Processing..."):
-            try:
-                parsed = parse_document(uploaded_file.getvalue(), uploaded_file.name)
-                added = st.session_state.vector_store.add_documents(parsed["chunks"])
-                st.success(f"✅ Added {added} chunks")
-            except Exception as e:
-                st.error(f"Error: {e}")
 with col2:
-    st.header("💬 Ask")
-    question = st.text_area("Question:", placeholder="What is this about?")
-    top_k = st.slider("Sources", 1, 5, 3)
-    if st.button("🔍 Answer", type="primary"):
         if not question:
-            st.warning("Enter a question")
-        elif not st.session_state.vector_store.documents:
-            st.warning("Upload documents first")
         else:
-            with st.spinner("Thinking..."):
                 results = st.session_state.vector_store.search(question, top_k)
                 if results:
-                    context = "\n\n".join([f"[{r['source']}]: {r['content']}" for r in results])
                     answer = generate_answer(question, context)
                     st.subheader("📝 Answer")
-                    st.write(answer)
                     st.subheader("📚 Sources")
-                    for r in results:
-                        with st.expander(r["source"]):
-                            st.write(r["content"][:300])
 st.divider()
-st.caption("Smart RAG API - FAISS + HuggingFace")

 import streamlit as st
 import os
+import re
 import pdfplumber
 from io import BytesIO
 from docx import Document
 import pandas as pd
 import numpy as np
 import faiss
 from huggingface_hub import InferenceClient
+# ============================================
+# SMART RAG API - HuggingFace Space Version
+# Technologies: Streamlit, FAISS, HuggingFace Hub
+# Parsers: pdfplumber, python-docx, pandas
+# ============================================
 # ============== CONFIG ==============
 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 50
+EMBEDDING_DIM = 384
+# ============== TEXT CHUNKING ==============
+def chunk_text(text):
+    """Convert text into clean, meaningful chunks with overlap."""
     if not text or not text.strip():
         return []
         end = start + CHUNK_SIZE
         chunk_content = text[start:end]
+        # Try to break at sentence boundary
         if end < len(text):
             last_period = chunk_content.rfind(". ")
             if last_period > CHUNK_SIZE * 0.5:
                 chunk_content = chunk_content[:last_period + 1]
                 end = start + last_period + 1
+        chunks.append({
+            "content": chunk_content.strip(),
+            "chunk_index": chunk_index
+        })
         chunk_index += 1
         start = end - CHUNK_OVERLAP
     return chunks
 # ============== DOCUMENT PARSERS ==============
+def parse_pdf(file_bytes):
+    """.pdf via pdfplumber"""
     text_parts = []
     with pdfplumber.open(BytesIO(file_bytes)) as pdf:
         for i, page in enumerate(pdf.pages):
                 text_parts.append(f"[Page {i + 1}]\n{page_text}")
     return "\n\n".join(text_parts)
+def parse_docx(file_bytes):
+    """.docx via python-docx"""
     doc = Document(BytesIO(file_bytes))  # wrap the raw bytes in a file-like object for python-docx
     paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]  # keep only paragraphs with non-whitespace text
     return "\n\n".join(paragraphs)  # one blank line between paragraphs
+def parse_txt(file_bytes):
+    """.txt directly"""
     return file_bytes.decode("utf-8")  # strict decode (assumes bytes input): non-UTF-8 data raises UnicodeDecodeError
+def parse_csv(file_bytes):
+    """.csv using pandas"""
     df = pd.read_csv(BytesIO(file_bytes))  # parsed with pandas defaults; no delimiter/encoding overrides
+    lines = [
+        f"Columns: {', '.join(df.columns.tolist())}",  # assumes column labels are strings; join() raises TypeError otherwise
+        f"Total rows: {len(df)}",  # row count of the whole frame, not just the preview below
+        "\nData:"
+    ]
     for idx, row in df.head(50).iterrows():  # only the first 50 rows are serialized; idx is unused
         row_text = " | ".join([f"{col}: {val}" for col, val in row.items()])  # "col: value" pairs, pipe-separated
         lines.append(row_text)
     return "\n".join(lines)  # header lines followed by one line per previewed row
+def parse_document(file_bytes, filename):
+    """Parse document and return chunks with metadata."""
     ext = filename.split(".")[-1].lower()
     if ext == "pdf":
         text = parse_docx(file_bytes)
     elif ext == "txt":
         text = parse_txt(file_bytes)
     elif ext == "csv":
         text = parse_csv(file_bytes)
     else:
+        text = f"[Unsupported file type: {ext}]"
     chunks = chunk_text(text)
+    # Add metadata (filename, chunk index)
     for chunk in chunks:
         chunk["source"] = filename
         chunk["file_type"] = ext
     return {"text": text, "chunks": chunks}
+# ============== EMBEDDINGS (lightweight, hash-based) ==============
+def simple_tokenize(text):
+    """Simple word tokenization."""
+    text = text.lower()  # case-fold first so the a-z pattern below also covers uppercase input
+    tokens = re.findall(r'\b[a-z]+\b', text)  # whole words made purely of ASCII a-z; words containing digits, underscores or non-ASCII letters yield no token
+    return tokens
+def hash_embed(text, dim=EMBEDDING_DIM):
+    """Simple hash-based embedding (lightweight alternative to sentence-transformers)."""
+    tokens = simple_tokenize(text)
+    vector = np.zeros(dim)  # one counter bucket per dimension
+    for token in tokens:
+        idx = hash(token) % dim  # NOTE(review): built-in hash() of str is salted per process unless PYTHONHASHSEED is fixed, so bucket indices are not stable across restarts
+        vector[idx] += 1  # bag-of-words count; distinct tokens that collide share a bucket
+    # Normalize to unit L2 length so the result does not scale with text length
+    norm = np.linalg.norm(vector)
+    if norm > 0:  # text with no tokens stays an all-zero vector (avoids division by zero)
+        vector = vector / norm
+    return vector
+def embed_texts(texts):
+    """Generate embeddings for multiple texts."""
+    return np.array([hash_embed(t) for t in texts]).astype("float32")  # one row per text, cast to float32 before it is passed to the FAISS index
+# ============== VECTOR STORE (FAISS) ==============
+class VectorStore:
+    """Store embeddings in FAISS for similarity search."""
     def __init__(self):
         self.index = None
         self.documents = []
+    def add_documents(self, chunks):
+        """Add document chunks to FAISS index."""
         if not chunks:  # nothing to index: leave the store untouched
             return 0
         texts = [c["content"] for c in chunks]  # each chunk dict must carry a "content" key
+        embeddings = embed_texts(texts)
         if self.index is None:  # index is created lazily on first add (and again after clear())
+            self.index = faiss.IndexFlatL2(EMBEDDING_DIM)
         self.index.add(embeddings)
         self.documents.extend(chunks)  # appended in the same order as the index rows — presumably search() maps row ids back through this list; confirm
         return len(chunks)  # number of chunks added by this call
+    def search(self, query, top_k=5):
+        """Perform similarity search."""
         if self.index is None or self.index.ntotal == 0:
             return []
+        query_embedding = embed_texts([query])
         distances, indices = self.index.search(query_embedding, top_k)
         results = []
         return results
     def clear(self):
+        """Clear all documents."""
         self.index = None
         self.documents = []
+    def get_stats(self):
+        """Get store statistics."""
+        return {
+            "total_documents": len(self.documents),  # counts stored chunk dicts, not uploaded files
+            "index_size": self.index.ntotal if self.index else 0  # 0 while no index exists (before the first add or after clear())
+        }
+# ============== LLM SERVICE (HuggingFace Hub) ==============
 def get_llm_client():
+    """Get HuggingFace Inference Client."""
+    token = os.getenv("HUGGINGFACE_API_KEY", "")  # environment variable takes precedence
+    if not token:
+        try:
+            token = st.secrets["HUGGINGFACE_API_KEY"]  # fall back to Streamlit secrets
+        except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit; a narrower exception type would be safer
+            token = ""
+    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=token if token else None)  # empty token is passed as None
+def generate_answer(question, context):
+    """Send prompt to LLM and return answer."""
+    prompt = f"""You are a helpful assistant that answers questions based on the provided context.
 CONTEXT:
 {context}
+INSTRUCTIONS:
+- Answer the question based ONLY on the context provided above.
+- If the context doesn't contain enough information, say so.
+- Be concise and direct.
+- Mention which source the information comes from if relevant.
 QUESTION: {question}
 ANSWER:"""
     try:
     except Exception as e:
         return f"Error: {str(e)}"
+# ============== STREAMLIT UI ==============
+st.set_page_config(
+    page_title="Smart RAG API",
+    page_icon="🔍",
+    layout="wide"
+)
 st.title("🔍 Smart RAG API")
+st.markdown("""
+**Retrieval-Augmented Generation API** - Upload documents and ask questions!
+**Technologies:** FastAPI • FAISS • pdfplumber • python-docx • pandas • HuggingFace Hub
+""")
+# Initialize vector store
 if "vector_store" not in st.session_state:
+    st.session_state.vector_store = VectorStore()
 # Sidebar
 with st.sidebar:
     st.header("📊 Status")
+    stats = st.session_state.vector_store.get_stats()  # snapshot of the store counters for the metrics below
     st.success("✅ Running")  # static badge; not derived from any check in this block
+    st.metric("Documents in Store", stats["total_documents"])
+    st.metric("Index Size", stats["index_size"])
+    st.divider()
+    if st.button("🗑️ Clear All Documents"):
         st.session_state.vector_store.clear()
+        st.success("Cleared!")  # NOTE(review): the rerun on the next line likely replaces this message before it is seen — confirm
         st.rerun()  # restart the script so the metrics above reflect the emptied store
     st.divider()
+    st.markdown("### 📁 Supported Files")
+    st.markdown("""
+    - 📕 **PDF** (pdfplumber)
+    - 📝 **DOCX** (python-docx)
+    - 📄 **TXT** (direct)
+    - 📊 **CSV** (pandas)
+    """)
+    st.divider()
+    st.markdown("### 🛠️ Tech Stack")
+    st.markdown("""
+    - **Vector Store:** FAISS
+    - **LLM:** HuggingFace Hub
+    - **Embeddings:** Custom (lightweight)
+    - **UI:** Streamlit
+    """)
+# Main layout
 col1, col2 = st.columns(2)
+# Upload Section
 with col1:
+    st.header("📤 Upload Document")
+    uploaded_file = st.file_uploader(
+        "Choose a file",
+        type=["pdf", "docx", "txt", "csv"],
+        help="Supported: PDF, DOCX, TXT, CSV"
+    )
+    if uploaded_file:
+        file_icon = {"pdf": "📕", "docx": "📝", "txt": "📄", "csv": "📊"}
+        ext = uploaded_file.name.split(".")[-1].lower()
+        st.info(f"{file_icon.get(ext, '📁')} **{uploaded_file.name}** ({uploaded_file.size} bytes)")
+        if st.button("📤 Process Document", type="primary"):
+            with st.spinner("Processing document..."):
+                try:
+                    file_bytes = uploaded_file.getvalue()
+                    parsed = parse_document(file_bytes, uploaded_file.name)
+                    added = st.session_state.vector_store.add_documents(parsed["chunks"])
+                    st.success(f"✅ Success! Added **{added} chunks** to knowledge base.")
+                    st.json({
+                        "filename": uploaded_file.name,
+                        "file_type": ext,
+                        "chunks_created": added
+                    })
+                except Exception as e:
+                    st.error(f"❌ Error: {str(e)}")
+# Query Section
 with col2:
+    st.header("💬 Ask Questions")
+    question = st.text_area(
+        "Your question:",
+        placeholder="What is this document about?",
+        height=100
+    )
+    top_k = st.slider("Number of sources to retrieve", 1, 10, 3)
+    if st.button("🔍 Search & Answer", type="primary"):
         if not question:
+            st.warning("⚠️ Please enter a question")
+        elif st.session_state.vector_store.get_stats()["total_documents"] == 0:
+            st.warning("⚠️ Please upload documents first")
         else:
+            with st.spinner("Searching and generating answer..."):
+                # Vector search
                 results = st.session_state.vector_store.search(question, top_k)
                 if results:
+                    # Build context
+                    context_parts = []
+                    for i, r in enumerate(results, 1):
+                        context_parts.append(f"[Source {i}: {r['source']}]\n{r['content']}")
+                    context = "\n\n".join(context_parts)
+                    # Generate answer via LLM
                     answer = generate_answer(question, context)
+                    # Display answer
                     st.subheader("📝 Answer")
+                    st.markdown(answer)
+                    # Display sources
                     st.subheader("📚 Sources")
+                    for i, r in enumerate(results, 1):
+                        with st.expander(f"Source {i}: {r['source']} (score: {r['score']:.3f})"):
+                            st.write(r["content"][:500] + "..." if len(r["content"]) > 500 else r["content"])
+                else:
+                    st.warning("No relevant documents found.")
+# Footer
 st.divider()
+st.caption("🚀 **Smart RAG API** | Built with FAISS, HuggingFace Hub, pdfplumber, python-docx, pandas | By Emon Karmoker")