# PDF_FAQ_Chatbot / app.py
# (Hugging Face Space page residue, kept as comments so the file parses:
#  ARtOrias11's picture — Update app.py — commit ddd0ab4, verified)
# app.py
import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import os
# Load LLM
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype="auto",
load_in_8bit=True # Enable 8-bit quantization for resource efficiency
)
llm = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=512,
do_sample=True,
temperature=0.7,
top_p=0.9
)
# Extract and cache full PDF text (as list of chunks)
def extract_text_chunks(pdf_file, chunk_size=1500, overlap=200):
reader = PdfReader(pdf_file)
full_text = ""
for page in reader.pages:
full_text += page.extract_text() or ""
chunks = []
start = 0
while start < len(full_text):
end = start + chunk_size
chunks.append(full_text[start:end])
start += chunk_size - overlap
return chunks
# Find best matching chunk based on query keywords
def find_relevant_chunk(chunks, query):
best_score = 0
best_chunk = ""
query_words = set(query.lower().split())
for chunk in chunks:
chunk_words = set(chunk.lower().split())
score = len(query_words.intersection(chunk_words))
if score > best_score:
best_score = score
best_chunk = chunk
return best_chunk
# Generate answer using LLM
def answer_query_from_pdf(pdf_file, query):
if not pdf_file:
return "Please upload a PDF file."
if not query:
return "Please enter a question."
chunks = extract_text_chunks(pdf_file.name)
relevant_chunk = find_relevant_chunk(chunks, query)
prompt = (
f"You are a helpful assistant. Based on the following document excerpt:\n\n"
f"{relevant_chunk}\n\n"
f"Answer this question: {query}"
)
result = llm(prompt)[0]["generated_text"]
return result.replace(prompt, "").strip()
# Gradio UI
demo = gr.Interface(
fn=answer_query_from_pdf,
inputs=[
gr.File(file_types=[".pdf"], label="Upload a large PDF (up to 22MB)"),
gr.Textbox(lines=2, placeholder="Ask a question about the PDF...", label="Your Question")
],
outputs="text",
title="๐Ÿ” Ask Questions from a Large PDF",
description="Upload a large PDF and ask questions. The bot finds relevant text and answers using Mistral-7B."
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860)