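"""RAG-based document Q&A with Streamlit, SentenceTransformers, FAISS, and Groq.

The app extracts text from an uploaded PDF, Excel, Word, or plain-text file,
splits it into chunks, embeds and indexes the chunks with FAISS, retrieves the
chunks most similar to the user's question, and asks a Groq-hosted model to
answer using that retrieved context.

A typical way to run it (assuming this file is saved as app.py and that a Groq
API key is available) is:

    pip install streamlit PyPDF2 pandas python-docx openpyxl \
        sentence-transformers faiss-cpu groq scikit-learn
    export GROQ_API_KEY="your-key"
    streamlit run app.py
"""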
import os

import streamlit as st
from PyPDF2 import PdfReader
import pandas as pd
import docx
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
import numpy as np
from sklearn.preprocessing import normalize

# Initialize the Groq API client (expects GROQ_API_KEY in the environment)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Initialize the SentenceTransformer embedding model
embedder_model = SentenceTransformer("all-MiniLM-L6-v2")

# Helper function to extract text from a PDF
def extract_text_from_pdf(file):
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Helper function to extract text from an Excel spreadsheet
def extract_text_from_excel(file):
    df = pd.read_excel(file)
    return df.to_string()

# Helper function to extract text from a Word document
def extract_text_from_docx(file):
    doc = docx.Document(file)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=512):
    # Split the text into fixed-size character chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Function to create a FAISS index and store embeddings
def create_faiss_index(texts, model):
    embeddings = model.encode(texts)
    embeddings = normalize(embeddings)  # L2-normalize so L2 distance ranks chunks like cosine similarity
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Create a flat L2 FAISS index
    index.add(embeddings)  # Add the embeddings to the index
    return index, embeddings

# Function to retrieve the most relevant chunks from the FAISS index
def retrieve_context(query, index, texts, model, top_k=5):
    query_embedding = normalize(model.encode([query]))  # Normalize to match the indexed embeddings
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads with -1 when fewer than top_k vectors are indexed
    retrieved_texts = [texts[i] for i in indices[0] if i != -1]
    return "\n".join(retrieved_texts)

# Function to query the Groq chat completions API
def query_groq_api(context, question):
    try:
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Context: {context}\nQuestion: {question}"},
            ],
            model="llama-3.3-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error querying Groq API: {e}"

# Streamlit app
def main():
    st.title("RAG-based Document Q&A")
    st.write("Upload a document, and ask questions based on its content.")

    uploaded_file = st.file_uploader("Upload your document", type=["pdf", "xlsx", "docx", "txt"])
    user_question = st.text_input("Enter your question:")

    if uploaded_file is not None:
        # Extract text based on the file type
        if uploaded_file.name.endswith(".pdf"):
            context = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.name.endswith(".xlsx"):
            context = extract_text_from_excel(uploaded_file)
        elif uploaded_file.name.endswith(".docx"):
            context = extract_text_from_docx(uploaded_file)
        elif uploaded_file.name.endswith(".txt"):
            context = uploaded_file.read().decode("utf-8")
        else:
            st.error("Unsupported file format!")
            return

        # Chunk the extracted text into smaller segments
        chunks = chunk_text(context)

        # Build the FAISS index over the text chunks
        index, embeddings = create_faiss_index(chunks, embedder_model)

        if user_question:
            if st.button("Submit Question"):
                st.write("Answer:")
                # Retrieve the most relevant context from the FAISS index
                retrieved_context = retrieve_context(user_question, index, chunks, embedder_model)
                # Query the Groq API with the retrieved context and the question
                answer = query_groq_api(retrieved_context, user_question)
                st.success(answer)


if __name__ == "__main__":
    main()