Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| from pypdf import PdfReader | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from groq import Groq | |
# -----------------------
# Initialize embedding model
# -----------------------
# Sentence-embedding model used to rank chunks against the user question.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -----------------------
# Initialize Groq client
# -----------------------
# GROQ_API_KEY must be provided in the environment (e.g. a Spaces secret);
# os.environ.get returns None when it is missing, which only fails later,
# at the first API call.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# -----------------------
# Helper functions
# -----------------------
def extract_text_from_pdfs(pdf_files):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_files: iterable of file paths / file-like objects accepted by
            pypdf.PdfReader.

    Returns:
        One string with a trailing "\n" after each page's text. Pages with
        no extractable text contribute only the newline.
    """
    pages = []
    for pdf in pdf_files:
        reader = PdfReader(pdf)
        for page in reader.pages:
            # extract_text() returns None for pages without extractable
            # text (e.g. scanned images); the original `text += ... + "\n"`
            # raised TypeError in that case. Coerce None to "".
            pages.append((page.extract_text() or "") + "\n")
    # join() instead of repeated += avoids quadratic string building.
    return "".join(pages)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: input text; split on any whitespace.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared between consecutive chunks.

    Returns:
        list[str] of space-joined chunks; [] for empty/whitespace text.

    Raises:
        ValueError: if overlap >= chunk_size — the original while-loop
            stepped by (chunk_size - overlap) and would never terminate
            for such arguments.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    # Windows start every `step` words; the last window may be shorter.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def retrieve_chunks(pdf_files, question):
    """Return the 3 chunks of the PDFs most similar to *question*.

    Embeds all chunks and the question with the module-level
    SentenceTransformer and ranks chunks by cosine similarity.

    Returns:
        list[str] of up to 3 chunks (best first) on success, or a plain
        error-message str when inputs are missing — callers distinguish
        the two cases by type (see answer_question).
    """
    if not pdf_files:
        # NOTE(review): the leading "β" in these messages looks like a
        # mis-encoded emoji (probably a cross mark) — confirm intended glyph.
        return "β Please upload PDF files."
    if not question:
        return "β Please enter a question."
    text = extract_text_from_pdfs(pdf_files)
    chunks = chunk_text(text)
    chunk_embeddings = model.encode(chunks)        # one vector per chunk
    question_embedding = model.encode([question])  # batch of 1
    # Row 0: similarity of the single question against every chunk.
    scores = cosine_similarity(question_embedding, chunk_embeddings)[0]
    # Indices of the 3 highest scores, reversed to best-first order.
    top_indices = np.argsort(scores)[-3:][::-1]
    retrieved_chunks = [chunks[i] for i in top_indices]
    return retrieved_chunks
# -----------------------
# RAG + Groq LLM integration
# -----------------------
def answer_question(pdf_files, question, history):
    """Answer *question* via RAG over the uploaded PDFs using Groq.

    Args:
        pdf_files: uploaded PDFs, forwarded to retrieve_chunks.
        question: the user's question string.
        history: accumulated Q/A transcript text, or None/"" on first call.

    Returns:
        (answer, updated_history). On validation failure the error message
        from retrieve_chunks is returned as the answer and history is
        passed through unchanged.
    """
    retrieved_chunks = retrieve_chunks(pdf_files, question)
    # retrieve_chunks signals validation errors by returning a str
    # instead of a list of chunks.
    if isinstance(retrieved_chunks, str):
        return retrieved_chunks, history
    context = "\n\n".join(retrieved_chunks)
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer concisely:"
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        model="llama-3.1-8b-instant",
        max_tokens=300
    )
    answer = response.choices[0].message.content
    # Update history
    history = history or ""
    history += f"Q: {question}\nA: {answer}\n\n"
    return answer, history
# -----------------------
# PDF Summarization
# -----------------------
def summarize_pdf(pdf_files):
    """Summarize the uploaded PDFs with the Groq LLM.

    Only the first 5 chunks (~2500 words with default chunking) are sent,
    as a deliberate speed/cost trade-off — the summary may miss later
    content in long documents.

    Returns:
        The summary string, or an error-message string when no files
        were uploaded.
    """
    if not pdf_files:
        # NOTE(review): leading "β" looks like a mis-encoded emoji — confirm.
        return "β Please upload PDF files first."
    text = extract_text_from_pdfs(pdf_files)
    chunks = chunk_text(text)
    context = "\n\n".join(chunks[:5])  # summarize first 5 chunks for speed
    prompt = f"Summarize the following PDF content concisely:\n\n{context}"
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful summarizer."},
            {"role": "user", "content": prompt}
        ],
        model="llama-3.1-8b-instant",
        max_tokens=200
    )
    summary = response.choices[0].message.content
    return summary
# -----------------------
# Gradio UI
# -----------------------
with gr.Blocks() as demo:
    # NOTE(review): "π€" in the title looks like a mis-encoded emoji
    # (probably a robot face) — confirm intended glyph.
    gr.Markdown("## π€ RAG PDF Chatbot with History & PDF Summarization")
    # Multiple-PDF upload; only .pdf files are accepted.
    pdf_input = gr.File(
        label="Upload PDF Files",
        file_types=[".pdf"],
        file_count="multiple"
    )
    question_input = gr.Textbox(
        label="Ask your question here",
        placeholder="e.g. What is the main objective of this document?"
    )
    # Read-only transcript; answer_question both reads and rewrites it.
    history_box = gr.Textbox(
        label="Answer History",
        lines=10,
        interactive=False
    )
    # Shared output box for both the Q&A answer and the PDF summary.
    answer_box = gr.Textbox(
        label="Answer",
        lines=8
    )
    # Buttons
    get_answer_btn = gr.Button("Get Answer")
    summarize_btn = gr.Button("Summarize PDF")
    # Button actions
    get_answer_btn.click(
        fn=answer_question,
        inputs=[pdf_input, question_input, history_box],
        outputs=[answer_box, history_box]
    )
    summarize_btn.click(
        fn=summarize_pdf,
        inputs=[pdf_input],
        outputs=[answer_box]
    )
demo.launch()