Hugging Face Space: RAG PDF Chatbot (status: Sleeping)
| import os | |
| import gradio as gr | |
| import PyPDF2 | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from groq import Groq | |
# ------------------ CONFIG ------------------
# Groq-hosted chat model used to answer questions over the retrieved context.
LLM_MODEL = "llama-3.1-8b-instant"
# Groq client; reads the API key from the environment at import time.
# NOTE(review): if GROQ_API_KEY is unset this passes api_key=None and calls
# will fail later at request time — confirm the deployment sets the variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Initialize sentence transformer model (embeds both PDF pages and queries).
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Global storage for documents and embeddings — a minimal in-memory,
# single-user "vector store": three parallel lists indexed together.
documents = []   # page texts extracted from the uploaded PDF
embeddings = []  # one sentence-transformer vector per entry in `documents`
metadata = []    # human-readable source labels ("file - Page n")
# ------------------ PDF Processing ------------------
def process_pdf(pdf_file):
    """Extract text from the uploaded PDF and rebuild the in-memory index.

    Replaces the module-level ``documents``/``embeddings``/``metadata``
    stores with one entry per non-empty page, then embeds every page with
    the sentence-transformer model.

    Args:
        pdf_file: gradio File value (exposes a ``.name`` path), or None
            when the button is clicked before any file is chosen.

    Returns:
        A status string shown in the UI.
    """
    global documents, embeddings, metadata
    # Guard: gradio passes None if "Process PDF" is clicked with no upload;
    # the original crashed here with AttributeError on pdf_file.name.
    if pdf_file is None:
        return "Please upload a PDF file first."
    # Reset ALL three parallel stores. The original left `embeddings`
    # untouched, so a failed/empty re-upload could pair stale embeddings
    # with an emptied document list.
    documents = []
    embeddings = []
    metadata = []
    reader = PyPDF2.PdfReader(pdf_file.name)
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text and text.strip():  # skip image-only / blank pages
            documents.append(text)
            metadata.append(f"{pdf_file.name} - Page {i+1}")  # 1-based pages
    if not documents:
        return "No text extracted from PDF. Are you sure it contains text?"
    # One embedding per page chunk; rows align with `documents` by index.
    embeddings = embedder.encode(documents)
    # NOTE(review): the leading 'β' looks like a mojibake'd checkmark —
    # kept byte-identical here; confirm the intended glyph.
    return f"β Processed {len(documents)} text chunks from PDF: {pdf_file.name}"
# ------------------ Retrieve Context ------------------
def retrieve_context(question, top_k=5):
    """Return (context, sources) for the `top_k` pages most similar to `question`.

    `context` is the selected page texts concatenated, each followed by a
    newline; `sources` is the matching list of "file - Page n" labels,
    ordered from most to least similar.
    """
    query_vec = embedder.encode([question])
    sims = cosine_similarity(query_vec, embeddings)[0]
    # argsort is ascending, so take the last top_k and flip to get
    # best-match-first ordering.
    ranked = np.argsort(sims)[-top_k:][::-1]
    context = "".join(documents[i] + "\n" for i in ranked)
    sources = [metadata[i] for i in ranked]
    return context, sources
# ------------------ Chat with Groq ------------------
def chat(question):
    """Answer `question` from the indexed PDF via the Groq chat API.

    Retrieves the most relevant pages, sends them as context to the LLM,
    and appends the source page labels to the model's answer. Returns a
    plain error string if no PDF is indexed or the API call fails.
    """
    if not documents:
        return "Please upload and process a PDF first."
    context, sources = retrieve_context(question)
    system_msg = {
        "role": "system",
        "content": "You are a helpful assistant answering questions based on the provided PDF context.",
    }
    user_msg = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {question}",
    }
    try:
        completion = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[system_msg, user_msg],
            temperature=0.2,  # low temperature: stick closely to the context
        )
        reply = completion.choices[0].message.content
        return reply + "\n\nSources:\n" + "\n".join(sources)
    except Exception as e:
        # Broad catch is deliberate here: this is the UI boundary, so any
        # API/network failure is surfaced as text instead of crashing.
        return f"Error communicating with Groq: {e}"
# ------------------ GRADIO UI ------------------
with gr.Blocks(title="RAG PDF Chatbot") as demo:
    gr.Markdown("# π RAG PDF Chatbot")
    gr.Markdown("Upload a PDF, process it, and ask questions based on its content.")

    # Upload / indexing controls (creation order defines the page layout).
    pdf_upload = gr.File(label="Upload PDF", file_types=['.pdf'])  # single PDF only
    status_display = gr.Markdown()
    process_button = gr.Button("Process PDF")

    # Question-answering controls.
    question_box = gr.Textbox(label="Ask a question")
    ask_button = gr.Button("Ask")
    answer_display = gr.Markdown(label="Answer")

    # Wire the buttons to the backend callbacks.
    process_button.click(fn=process_pdf, inputs=pdf_upload, outputs=status_display)
    ask_button.click(fn=chat, inputs=question_box, outputs=answer_display)

demo.launch()