Spaces:

ARtOrias11
/

PDF_FAQ_Chatbot

Runtime error

App Files Files Community

ARtOrias11 committed on May 26, 2025

Commit

ddd0ab4

verified ·

1 Parent(s): 1df9ebb

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -0

app.py CHANGED Viewed

	@@ -0,0 +1,89 @@

+# app.py
+import gradio as gr
+from PyPDF2 import PdfReader
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import os
+# Load the LLM and tokenizer once at import time, then wrap them in a
+# text-generation pipeline (`llm`) used by the answer function below.
+# NOTE(review): 8-bit loading presumably requires bitsandbytes and a CUDA
+# GPU - confirm against the hardware this app is deployed on.
+model_id = "mistralai/Mistral-7B-Instruct-v0.1"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+_model_load_options = {
+    "device_map": "auto",
+    "torch_dtype": "auto",
+    "load_in_8bit": True,  # Enable 8-bit quantization for resource efficiency
+}
+model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_options)
+# Sampling settings applied to every generation call.
+_generation_options = {
+    "max_new_tokens": 512,
+    "do_sample": True,
+    "temperature": 0.7,
+    "top_p": 0.9,
+}
+llm = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    **_generation_options,
+)
+# Extract the full PDF text and split it into overlapping character chunks
+def extract_text_chunks(pdf_file, chunk_size=1500, overlap=200):
+    """Read every page of a PDF and return its text as overlapping chunks.
+
+    Args:
+        pdf_file: Passed unchanged to ``PdfReader`` (the caller in this file
+            supplies a filesystem path).
+        chunk_size: Maximum number of characters per chunk.
+        overlap: Number of characters shared by consecutive chunks.
+
+    Returns:
+        A list of strings; empty when no text could be extracted.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
+            in the range ``0 <= overlap < chunk_size``. Such values would
+            stop the window from advancing (an endless loop) or silently
+            skip text between chunks.
+    """
+    # Validate before touching the file so bad parameters fail fast.
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if not 0 <= overlap < chunk_size:
+        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
+    reader = PdfReader(pdf_file)
+    # Guard against a falsy extract_text() result; join avoids repeated
+    # string concatenation across pages.
+    full_text = "".join(page.extract_text() or "" for page in reader.pages)
+    # Each window starts `step` characters after the previous one.
+    step = chunk_size - overlap
+    return [
+        full_text[start:start + chunk_size]
+        for start in range(0, len(full_text), step)
+    ]
+# Find the chunk sharing the most distinct words with the query
+def find_relevant_chunk(chunks, query):
+    """Return the chunk that shares the most distinct words with ``query``.
+
+    Words are compared case-insensitively with punctuation stripped, so a
+    query ending in "price?" still matches a chunk containing "price".
+
+    Args:
+        chunks: Iterable of text chunks to search.
+        query: The user's question.
+
+    Returns:
+        The first chunk with the highest word overlap, or ``""`` when no
+        chunk shares any word with the query (including when ``chunks`` is
+        empty).
+    """
+    import re  # local import so this block does not depend on file-level imports
+
+    def _words(text):
+        # Distinct lowercase word tokens, ignoring surrounding punctuation.
+        return set(re.findall(r"\w+", text.lower()))
+
+    query_words = _words(query)
+    best_score = 0
+    best_chunk = ""
+    for chunk in chunks:
+        score = len(query_words & _words(chunk))
+        # Strict ">" keeps the earliest chunk on ties and rejects zero overlap.
+        if score > best_score:
+            best_score = score
+            best_chunk = chunk
+    return best_chunk
+# Generate answer using LLM
+def answer_query_from_pdf(pdf_file, query):
+    """Answer ``query`` from an uploaded PDF using the module-level ``llm``.
+
+    The PDF text is chunked, the chunk with the most words in common with
+    the question is selected, and that excerpt plus the question is sent to
+    the text-generation pipeline.
+
+    Args:
+        pdf_file: The uploaded file: either a path string or an object whose
+            ``name`` attribute holds the path. Falsy means nothing uploaded.
+        query: The user's question.
+
+    Returns:
+        The generated answer, or a short instruction/diagnostic message when
+        the file or question is missing or the PDF yields no text.
+    """
+    if not pdf_file:
+        return "Please upload a PDF file."
+    # A whitespace-only question carries no keywords; treat it as missing.
+    if not query or not query.strip():
+        return "Please enter a question."
+    # Accept a plain path string as well as an object exposing ``.name``.
+    pdf_path = getattr(pdf_file, "name", pdf_file)
+    chunks = extract_text_chunks(pdf_path)
+    if not chunks:
+        # Nothing to ground an answer on; do not prompt with an empty excerpt.
+        return "No readable text could be extracted from this PDF."
+    relevant_chunk = find_relevant_chunk(chunks, query)
+    prompt = (
+        "You are a helpful assistant. Based on the following document excerpt:\n\n"
+        f"{relevant_chunk}\n\n"
+        f"Answer this question: {query}"
+    )
+    # return_full_text=False asks the pipeline for the completion only, so
+    # the prompt does not have to be stripped back out by string replacement.
+    result = llm(prompt, return_full_text=False)[0]["generated_text"]
+    return result.strip()
+# Gradio UI: a PDF upload and a question box wired to answer_query_from_pdf
+_pdf_upload = gr.File(
+    file_types=[".pdf"],
+    label="Upload a large PDF (up to 22MB)",
+)
+_question_box = gr.Textbox(
+    lines=2,
+    placeholder="Ask a question about the PDF...",
+    label="Your Question",
+)
+demo = gr.Interface(
+    fn=answer_query_from_pdf,
+    inputs=[_pdf_upload, _question_box],
+    outputs="text",
+    title="🔍 Ask Questions from a Large PDF",
+    description=(
+        "Upload a large PDF and ask questions. "
+        "The bot finds relevant text and answers using Mistral-7B."
+    ),
+)
+# Start the web server only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)