File size: 2,580 Bytes
ddd0ab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# app.py
import os

import gradio as gr
from PyPDF2 import PdfReader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Load LLM: Mistral-7B-Instruct, 8-bit quantized so it fits on a single
# consumer GPU. `device_map="auto"` lets accelerate place layers across
# available devices; `torch_dtype="auto"` picks the checkpoint's native dtype.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: the bare `load_in_8bit=True` kwarg to from_pretrained() is deprecated
# (removed in recent transformers releases); quantization must be requested
# through a BitsAndBytesConfig instead. Requires the bitsandbytes package.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Sampling generation pipeline shared by all requests.
# temperature/top_p give mildly creative but mostly grounded answers;
# max_new_tokens caps the answer length, not the prompt.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

# Extract and cache full PDF text (as list of chunks)
def extract_text_chunks(pdf_file, chunk_size=1500, overlap=200):
    """Read all text from *pdf_file* and split it into overlapping chunks.

    Args:
        pdf_file: Path (or file object) accepted by PyPDF2's PdfReader.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks, so a sentence
            straddling a boundary still appears whole in one chunk.

    Returns:
        List of text chunks; empty list when the PDF has no extractable text.
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None for image-only pages — coalesce to "".
    # str.join is linear; repeated `+=` on a str would be quadratic.
    full_text = "".join(page.extract_text() or "" for page in reader.pages)

    # Guard: if overlap >= chunk_size the original step (chunk_size - overlap)
    # would be <= 0 and the loop would never terminate. Clamp to at least 1.
    step = max(1, chunk_size - overlap)
    return [full_text[start:start + chunk_size]
            for start in range(0, len(full_text), step)]

# Find best matching chunk based on query keywords
def find_relevant_chunk(chunks, query):
    """Return the chunk sharing the most (case-insensitive) words with *query*.

    Naive bag-of-words overlap: ties go to the earliest chunk. Unlike a plain
    argmax-over-zero, this falls back to the FIRST chunk when no word matches,
    so the caller always gets some document context for the prompt; "" is
    returned only when *chunks* itself is empty.
    """
    if not chunks:
        return ""
    query_words = set(query.lower().split())
    # max() returns the first maximal element, matching the original
    # "strictly greater" tie-breaking; with all-zero scores it yields
    # chunks[0], which is the deliberate fallback.
    return max(chunks,
               key=lambda chunk: len(query_words & set(chunk.lower().split())))

# Generate answer using LLM
def answer_query_from_pdf(pdf_file, query):
    """Answer *query* from the most relevant excerpt of the uploaded PDF.

    Args:
        pdf_file: Value from gr.File — a tempfile wrapper with a ``.name``
            path, or (in newer Gradio versions) a plain path string.
        query: The user's question.

    Returns:
        The model's answer, or a short instruction string when input is
        missing or the PDF contains no extractable text.
    """
    if not pdf_file:
        return "Please upload a PDF file."
    # strip() so a whitespace-only question doesn't slip past validation.
    if not query or not query.strip():
        return "Please enter a question."

    # Accept both the tempfile-wrapper and plain-path forms of gr.File output.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    chunks = extract_text_chunks(pdf_path)
    if not chunks:
        return "No extractable text was found in the PDF."
    relevant_chunk = find_relevant_chunk(chunks, query)

    prompt = (
        f"You are a helpful assistant. Based on the following document excerpt:\n\n"
        f"{relevant_chunk}\n\n"
        f"Answer this question: {query}"
    )

    # return_full_text=False makes the pipeline return only the completion,
    # which is more robust than str.replace()-ing the prompt out of the output
    # (replace silently fails if the echoed prompt differs by a single char).
    result = llm(prompt, return_full_text=False)[0]["generated_text"]
    return result.strip()

# Gradio UI: two inputs (PDF upload + question textbox) mapped straight onto
# answer_query_from_pdf's two positional parameters; its string return value
# is rendered in the plain-text output component.
demo = gr.Interface(
    fn=answer_query_from_pdf,
    inputs=[
        # file_types restricts the picker to PDFs; the handler receives the
        # uploaded file object (validation messages come from the fn itself).
        gr.File(file_types=[".pdf"], label="Upload a large PDF (up to 22MB)"),
        gr.Textbox(lines=2, placeholder="Ask a question about the PDF...", label="Your Question")
    ],
    outputs="text",
    title="🔍 Ask Questions from a Large PDF",
    description="Upload a large PDF and ask questions. The bot finds relevant text and answers using Mistral-7B."
)

if __name__ == "__main__":
    # Bind to 0.0.0.0 so the app is reachable from outside the container/host
    # (standard for Hugging Face Spaces / Docker deployments); port 7860 is
    # Gradio's conventional default.
    demo.launch(server_name="0.0.0.0", server_port=7860)