import os

import docx
import faiss
import pandas as pd
import PyPDF2
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer
|
|
| |
# Groq API client. The key is read from the GROQ_API_KEY environment variable
# instead of being embedded in source, so it never lands in version control.
# Any key that was previously committed in this file should be treated as
# leaked and revoked.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Sentence-embedding model used for both document lines and user queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Flat L2 FAISS index plus a parallel list of the indexed text: row i of the
# index is expected to correspond to document_texts[i].
# NOTE(review): 384 must equal the embedding width produced by the model
# above -- confirm whenever the model name is changed.
dimension = 384
index = faiss.IndexFlatL2(dimension)
document_texts = []
|
|
| |
def extract_text_from_file(file):
    """Extract plain text from an uploaded file, dispatching on its extension.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and a
            ``read()`` method (for example a Streamlit uploaded file).

    Returns:
        The extracted text as a string (possibly empty), or ``None`` when the
        extension is not one of .pdf, .csv, .xlsx, .xls, .txt or .docx.
    """
    # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    name = file.name.lower()

    if name.endswith(".pdf"):
        pdf_reader = PyPDF2.PdfReader(file)
        # NOTE(review): extract_text() may yield None/empty for pages without
        # a text layer in some PyPDF2 versions; "or ''" keeps the join safe.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    if name.endswith(".csv"):
        return _rows_to_text(pd.read_csv(file))
    if name.endswith((".xlsx", ".xls")):
        return _rows_to_text(pd.read_excel(file))
    if name.endswith(".txt"):
        # Replace undecodable bytes rather than failing on non-UTF-8 files.
        return file.read().decode("utf-8", errors="replace")
    if name.endswith(".docx"):
        doc = docx.Document(file)
        return "\n".join(p.text for p in doc.paragraphs)

    # Unsupported extension: signalled with None so callers can tell it apart
    # from a supported file that simply contained no text.
    return None


def _rows_to_text(df):
    """Render a DataFrame as one line per row, cells separated by spaces."""
    return "\n".join(" ".join(map(str, row)) for row in df.values)
|
|
| |
def add_to_index(text, index, document_texts):
    """Embed every non-blank line of ``text`` and append it to the index.

    Args:
        text: Document text; it is split on newlines and each non-blank line
            becomes one indexed entry (surrounding whitespace is trimmed).
        index: Vector index exposing ``add(embeddings)``.
        document_texts: List extended in place with the indexed lines, in the
            same order their embeddings are added to ``index``.

    Returns:
        None. Nothing is embedded or stored when ``text`` has no usable lines.
    """
    # Blank lines carry no meaning but would still get an embedding and could
    # be returned as "context" by the retriever, so they are dropped here.
    sentences = [line.strip() for line in (text or "").split("\n") if line.strip()]
    if not sentences:
        return

    embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
    index.add(embeddings)
    # Appended after index.add() so the list only grows when the vectors did.
    document_texts.extend(sentences)
|
|
| |
def rag_query(query, index, document_texts, top_k=3):
    """Answer ``query`` from the indexed documents (retrieve, then generate).

    Args:
        query: The user's question.
        index: Vector index exposing ``ntotal`` and ``search(vectors, k)``.
        document_texts: Indexed text entries; position i is expected to match
            row i of ``index``.
        top_k: Maximum number of entries to retrieve as context.

    Returns:
        The model's answer as a string, or a short notice when there is
        nothing indexed to retrieve from.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which as a Python list index would wrap to the last entry.
    k = min(top_k, index.ntotal, len(document_texts))
    if k <= 0:
        return (
            "No document content has been indexed yet. "
            "Upload a supported document and try again."
        )

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)

    # Defensive filter: skip padding (-1) or labels outside the text list.
    retrieved_context = " ".join(
        document_texts[idx]
        for idx in indices[0]
        if 0 <= idx < len(document_texts)
    )

    prompt = f"Context: {retrieved_context}\n\nQuestion: {query}"

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": prompt}
        ],
        model="gemma2-9b-it",
    )
    return chat_completion.choices[0].message.content
|
|
| |
# ---- Streamlit page: upload documents, index them, then answer questions ----
st.title("RAG-Based Document Q&A")
st.write("Upload your documents and ask questions based on the content.")

uploaded_files = st.file_uploader(
    "Upload PDFs, CSVs, Excel, or Text files",
    type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
    accept_multiple_files=True,
)

if uploaded_files:
    for file in uploaded_files:
        with st.spinner(f"Processing {file.name}..."):
            # UI boundary: a single unreadable file should be reported and
            # skipped, not abort processing of the remaining uploads.
            try:
                text = extract_text_from_file(file)
            except Exception as exc:
                st.error(f"Could not process {file.name}: {exc}")
                continue

            if text is None:
                # None is the extractor's signal for an unsupported extension.
                st.error(f"Could not process {file.name}. Unsupported file format.")
            elif not text.strip():
                # Supported format, but nothing to index (e.g. an empty file).
                st.warning(f"No text could be extracted from {file.name}.")
            else:
                add_to_index(text, index, document_texts)
                st.success(f"Processed {file.name}")

query = st.text_input("Enter your question:")
if query:
    if not document_texts:
        # Retrieval needs at least one indexed entry to build a context.
        st.warning("Upload at least one document before asking a question.")
    else:
        with st.spinner("Generating response..."):
            response = rag_query(query, index, document_texts)
        st.write("### Answer:")
        st.write(response)
|
|