ARtOrias11 commited on
Commit
ddd0ab4
·
verified ·
1 Parent(s): 1df9ebb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py CHANGED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from PyPDF2 import PdfReader
4
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
5
+ import os
6
+
7
# Load LLM
# NOTE(review): Mistral-7B-Instruct is a large hosted checkpoint — loading it
# requires network access to the Hugging Face Hub, and load_in_8bit=True needs
# bitsandbytes plus a CUDA device; confirm the deployment environment has both.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",   # let transformers/accelerate place layers on available devices
    torch_dtype="auto",  # take the dtype from the checkpoint config
    load_in_8bit=True  # Enable 8-bit quantization for resource efficiency
)

# Shared text-generation pipeline used by answer_query_from_pdf below.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # hard cap on the length of each generated answer
    do_sample=True,      # stochastic decoding: answers vary run to run
    temperature=0.7,
    top_p=0.9
)
27
+
28
# Extract and cache full PDF text (as list of chunks)
def extract_text_chunks(pdf_file, chunk_size=1500, overlap=200):
    """Read every page of a PDF and split the concatenated text into
    overlapping character chunks.

    Args:
        pdf_file: Path or binary file-like object accepted by PyPDF2.PdfReader.
        chunk_size: Maximum number of characters per chunk.
        overlap: Characters shared between consecutive chunks, so sentences
            that straddle a chunk boundary appear intact in at least one chunk.

    Returns:
        list[str]: Chunks in document order; empty list when the PDF yields
        no extractable text.
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None (e.g. image-only pages); treat as "".
    full_text = "".join(page.extract_text() or "" for page in reader.pages)

    # BUG FIX: the original advanced by `chunk_size - overlap`, which is zero
    # or negative when overlap >= chunk_size and made the loop run forever.
    # Clamp the step to at least 1 so any argument combination terminates.
    step = max(chunk_size - overlap, 1)
    return [full_text[start:start + chunk_size]
            for start in range(0, len(full_text), step)]
42
+
43
# Find best matching chunk based on query keywords
def find_relevant_chunk(chunks, query):
    """Return the chunk sharing the most (case-insensitive) words with *query*.

    Scoring is a plain bag-of-words overlap: the number of distinct query
    words that also occur in the chunk. Ties go to the earliest chunk, and
    "" is returned when no chunk shares any word with the query (including
    when *chunks* is empty).
    """
    terms = set(query.lower().split())

    best, top = "", 0
    for candidate in chunks:
        # Distinct words in this chunk that also appear in the query.
        overlap_count = len(terms & set(candidate.lower().split()))
        if overlap_count > top:
            top, best = overlap_count, candidate
    return best
56
+
57
# Generate answer using LLM
def answer_query_from_pdf(pdf_file, query):
    """Answer a question about an uploaded PDF using the module-level `llm`.

    Args:
        pdf_file: Gradio file object (its .name is the temp-file path);
            falsy when the user has not uploaded anything.
        query: The user's question as plain text.

    Returns:
        str: The model's answer, or a user-facing instruction when either
        input is missing.
    """
    # Guard clauses: both inputs are required.
    if not pdf_file:
        return "Please upload a PDF file."
    if not query:
        return "Please enter a question."

    # Naive retrieval: re-read the whole PDF on every question and pick the
    # single best keyword-matching chunk as context.
    chunks = extract_text_chunks(pdf_file.name)
    relevant_chunk = find_relevant_chunk(chunks, query)

    prompt = (
        f"You are a helpful assistant. Based on the following document excerpt:\n\n"
        f"{relevant_chunk}\n\n"
        f"Answer this question: {query}"
    )

    # BUG FIX: the original did `result.replace(prompt, "").strip()`, which
    # silently corrupts the answer whenever the echoed prompt differs by even
    # one character (tokenizer round-trip, whitespace). return_full_text=False
    # asks the pipeline for the completion only, so no stripping is needed.
    result = llm(prompt, return_full_text=False)[0]["generated_text"]
    return result.strip()
75
+
76
# Gradio UI: one function, two inputs, plain-text output.
_pdf_input = gr.File(file_types=[".pdf"], label="Upload a large PDF (up to 22MB)")
_question_input = gr.Textbox(
    lines=2,
    placeholder="Ask a question about the PDF...",
    label="Your Question",
)

demo = gr.Interface(
    fn=answer_query_from_pdf,
    inputs=[_pdf_input, _question_input],
    outputs="text",
    title="🔍 Ask Questions from a Large PDF",
    description="Upload a large PDF and ask questions. The bot finds relevant text and answers using Mistral-7B.",
)

if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)