Spaces:

MohammadYaseen
/

Simple-RAG-Applicstion-Test

Sleeping

App Files Files Community

MohammadYaseen committed on Nov 24, 2024

Commit

e6fb287

verified ·

1 Parent(s): 03b3412

Create app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import os
+import pandas as pd
+import PyPDF2
+import docx
+from sentence_transformers import SentenceTransformer
+import faiss
+from groq import Groq
+import streamlit as st
+# Initialize Groq API Client
+client = Groq(api_key="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")
+# SentenceTransformer model for embeddings
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# FAISS index for retrieval
+dimension = 384  # Dimension of 'all-MiniLM-L6-v2' embeddings
+index = faiss.IndexFlatL2(dimension)
+document_texts = []  # Store text corresponding to embeddings
+# Helper function: Extract text from different file types
+def extract_text_from_file(file):
+    text = ""
+    if file.name.endswith(".pdf"):
+        pdf_reader = PyPDF2.PdfReader(file)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    elif file.name.endswith(".csv"):
+        df = pd.read_csv(file)
+        text = "\n".join([" ".join(map(str, row)) for row in df.values])
+    elif file.name.endswith(".xlsx") or file.name.endswith(".xls"):
+        df = pd.read_excel(file)
+        text = "\n".join([" ".join(map(str, row)) for row in df.values])
+    elif file.name.endswith(".txt"):
+        text = file.read().decode("utf-8")
+    elif file.name.endswith(".docx"):
+        doc = docx.Document(file)
+        text = "\n".join([p.text for p in doc.paragraphs])
+    else:
+        text = None
+    return text
+# Add document embeddings to FAISS
+def add_to_index(text, index, document_texts):
+    sentences = text.split("\n")
+    embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
+    index.add(embeddings)
+    document_texts.extend(sentences)
+# Perform RAG Query
+def rag_query(query, index, document_texts, top_k=3):
+    """
+    Perform a RAG query: Retrieve relevant documents and generate a response.
+    """
+    # Generate query embedding and retrieve closest matches
+    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
+    distances, indices = index.search(query_embedding, top_k)
+    # Build the context from retrieved documents
+    retrieved_context = " ".join([document_texts[idx] for idx in indices[0]])
+    # Construct the prompt for the Groq model
+    prompt = f"Context: {retrieved_context}\n\nQuestion: {query}"
+    # Generate a response using Groq API
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {"role": "user", "content": prompt}
+        ],
+        model="gemma2-9b-it",
+    )
+    return chat_completion.choices[0].message.content
+# Streamlit UI
+st.title("RAG-Based Document Q&A")
+st.write("Upload your documents and ask questions based on the content.")
+uploaded_files = st.file_uploader(
+    "Upload PDFs, CSVs, Excel, or Text files",
+    type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
+    accept_multiple_files=True,
+)
+if uploaded_files:
+    for file in uploaded_files:
+        with st.spinner(f"Processing {file.name}..."):
+            text = extract_text_from_file(file)
+            if text:
+                add_to_index(text, index, document_texts)
+                st.success(f"Processed {file.name}")
+            else:
+                st.error(f"Could not process {file.name}. Unsupported file format.")
+query = st.text_input("Enter your question:")
+if query:
+    with st.spinner("Generating response..."):
+        response = rag_query(query, index, document_texts)
+        st.write("### Answer:")
+        st.write(response)