# app.py
"""Gradio app: question answering over an uploaded PDF with Mistral-7B.

The PDF text is split into overlapping chunks; a simple keyword-overlap
score picks the most relevant chunk, which is fed to the LLM as context.
"""
import os

import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load LLM (module-level so the model is loaded once, at startup).
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    load_in_8bit=True,  # 8-bit quantization for resource efficiency
)
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)


def extract_text_chunks(pdf_file, chunk_size=1500, overlap=200):
    """Extract the full text of a PDF and split it into overlapping chunks.

    Args:
        pdf_file: Path (or file-like object) accepted by ``PdfReader``.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks, so a
            sentence cut at a boundary still appears whole in one chunk.

    Returns:
        list[str]: The text chunks (empty list if the PDF has no text).

    Raises:
        ValueError: If ``overlap >= chunk_size`` (would loop forever).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    reader = PdfReader(pdf_file)
    # extract_text() may return None for image-only pages — treat as "".
    full_text = "".join(page.extract_text() or "" for page in reader.pages)
    chunks = []
    start = 0
    step = chunk_size - overlap
    while start < len(full_text):
        chunks.append(full_text[start:start + chunk_size])
        start += step
    return chunks


def find_relevant_chunk(chunks, query):
    """Return the chunk sharing the most (case-insensitive) words with *query*.

    Falls back to the first chunk when no chunk overlaps the query at all,
    so the LLM always receives some document context.
    """
    if not chunks:
        return ""
    query_words = set(query.lower().split())
    best_chunk = chunks[0]
    best_score = 0
    for chunk in chunks:
        score = len(query_words.intersection(chunk.lower().split()))
        if score > best_score:
            best_score = score
            best_chunk = chunk
    return best_chunk


def answer_query_from_pdf(pdf_file, query):
    """Answer *query* using the most relevant excerpt of the uploaded PDF.

    Args:
        pdf_file: Gradio file object (with a ``.name`` temp path) or a path.
        query: The user's question.

    Returns:
        str: The model's answer, or a validation message.
    """
    if not pdf_file:
        return "Please upload a PDF file."
    if not query:
        return "Please enter a question."
    # Gradio's File component yields an object with .name; also accept a path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    chunks = extract_text_chunks(pdf_path)
    relevant_chunk = find_relevant_chunk(chunks, query)
    prompt = (
        "You are a helpful assistant. Based on the following document excerpt:\n\n"
        f"{relevant_chunk}\n\n"
        f"Answer this question: {query}"
    )
    result = llm(prompt)[0]["generated_text"]
    # text-generation echoes the prompt; strip it exactly from the front.
    if result.startswith(prompt):
        result = result[len(prompt):]
    return result.strip()


# Gradio UI
demo = gr.Interface(
    fn=answer_query_from_pdf,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload a large PDF (up to 22MB)"),
        gr.Textbox(lines=2, placeholder="Ask a question about the PDF...", label="Your Question"),
    ],
    outputs="text",
    title="🔍 Ask Questions from a Large PDF",
    description="Upload a large PDF and ask questions. The bot finds relevant text and answers using Mistral-7B.",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)