# Hugging Face Spaces app.
# NOTE(review): the hosted Space reported "Runtime error" at capture time.
# Standard library
import os

# Third-party
import fitz  # PyMuPDF
import faiss
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# --- SETTINGS ---
PDF_DIR = "data"   # folder scanned for *.pdf files at startup
MAX_TOKENS = 500   # approximate word budget per text chunk
TOP_K = 4          # number of chunks retrieved per question

# --- MODELS ---
# Embedding model, used for both document chunks and incoming questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Instruction-tuned LLM that generates the final answer.
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
| # --- UTILS --- | |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in *pdf_path*.

    Fixes a resource leak in the original, which opened the PyMuPDF
    document and never closed it: the ``with`` block guarantees the file
    handle is released even if ``get_text()`` raises on some page.
    """
    with fitz.open(pdf_path) as doc:
        # str.join avoids the quadratic `text += ...` accumulation.
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_tokens=None):
    """Split *text* into chunks of roughly *max_tokens* whitespace words.

    Sentences (split on ``". "``) are packed greedily; a sentence that
    would push the current chunk past the budget starts a new chunk.
    Each sentence has ``". "`` re-appended, matching the original logic.

    Fixes over the original:
    - empty / whitespace-only input now returns ``[]`` instead of ``[". "]``;
    - whitespace-only chunks are never emitted;
    - *max_tokens* defaults to the module-level ``MAX_TOKENS`` lazily
      (call-time lookup), not eagerly at function-definition time.

    Args:
        text: raw document text.
        max_tokens: word budget per chunk; ``None`` means ``MAX_TOKENS``.

    Returns:
        List of non-empty chunk strings.
    """
    if not text.strip():
        return []
    if max_tokens is None:
        max_tokens = MAX_TOKENS
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        # Would adding this sentence overflow the current chunk?
        if len((chunk + sentence).split()) > max_tokens:
            if chunk.strip():
                chunks.append(chunk)
            chunk = sentence + ". "
        else:
            chunk += sentence + ". "
    # Flush the trailing chunk, but never emit an empty one.
    if chunk.strip():
        chunks.append(chunk)
    return chunks
# --- LOAD & INDEX ALL PDFs ---
print("π Loading and indexing all PDFs in /data ...")
all_chunks = []     # flat list of text chunks across every PDF
chunk_to_doc = []   # parallel list: source filename for each chunk
# sorted() makes chunk order (and therefore FAISS row ids) deterministic
# across runs; .lower() also accepts ".PDF" / ".Pdf" extensions, which the
# original case-sensitive endswith(".pdf") silently skipped.
for filename in sorted(os.listdir(PDF_DIR)):
    if filename.lower().endswith(".pdf"):
        path = os.path.join(PDF_DIR, filename)
        text = extract_text_from_pdf(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_to_doc.extend([filename] * len(chunks))
if not all_chunks:
    # Fail fast with a clear message instead of a cryptic error from
    # encode() / faiss on an empty corpus.
    raise RuntimeError(f"No PDF text found in '{PDF_DIR}' - nothing to index.")
# Embed every chunk once at startup and build an exact L2 index over them.
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"β Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
| # --- QA FUNCTION --- | |
| def answer_question(question): | |
| question_embedding = embed_model.encode([question]) | |
| _, top_indices = index.search(np.array(question_embedding), k=TOP_K) | |
| context_chunks = [all_chunks[i] for i in top_indices[0]] | |
| source_docs = [chunk_to_doc[i] for i in top_indices[0]] | |
| context = "\n".join([f"[{source_docs[i]}]\n{context_chunks[i]}" for i in range(len(context_chunks))]) | |
| prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:" | |
| output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"] | |
| return output.split("Answer:")[-1].strip() | |
# --- UI ---
with gr.Blocks() as demo:
    # Header / usage hint.
    gr.Markdown("# π€ PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")
    with gr.Row():
        # Question input and submit button side by side.
        user_question = gr.Textbox(label="Your Question")
        ask_button = gr.Button("Get Answer")
    model_answer = gr.Textbox(label="Answer")
    # Wire the button to the retrieval-augmented QA function.
    ask_button.click(fn=answer_question, inputs=user_question, outputs=model_answer)
demo.launch()