IW2025 committed on
Commit
be4a77c
·
verified ·
1 Parent(s): a9f34af

Upload 7 files

Browse files
Files changed (3) hide show
  1. app.py +91 -14
  2. app_config.toml +12 -8
  3. requirements.txt +10 -9
app.py CHANGED
@@ -5,6 +5,11 @@ import fitz # PyMuPDF
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
8
  import base64
9
  from PIL import Image
10
  import io
@@ -20,8 +25,11 @@ class FastPDFSearch:
20
  self.chunk_metadata = []
21
  self.vector_db = None
22
  self.embeddings = None
 
 
23
  self._process_pdfs(slides_dir)
24
  self._build_vector_db()
 
25
 
26
  def _process_pdfs(self, slides_dir):
27
  slides_path = Path(slides_dir)
@@ -53,6 +61,50 @@ class FastPDFSearch:
53
  metadatas=self.chunk_metadata,
54
  persist_directory="./chroma_db"
55
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def get_pdf_page_image(self, pdf_path, page_num):
58
  try:
@@ -74,19 +126,44 @@ class FastPDFSearch:
74
  return None
75
 
76
  def search(self, query):
77
- # Find the most relevant chunk (page)
78
- results = self.vector_db.similarity_search(query, k=1)
79
  if not results:
80
- return "No relevant page found.", None, None
81
- result = results[0]
82
- filename = result.metadata["filename"]
83
- page_number = result.metadata["page_number"]
84
- text = result.page_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  img = self.get_pdf_page_image(self.pdf_files[filename], page_number)
 
86
  if img:
87
- return text, img, f"{filename} - Page {page_number}"
88
  else:
89
- return text, None, f"{filename} - Page {page_number}"
90
 
91
  # --- Gradio UI ---
92
  searcher = FastPDFSearch()
@@ -98,15 +175,15 @@ def gradio_search(query):
98
  else:
99
  return text, []
100
 
101
- with gr.Blocks(title="Fast PDF Curriculum Search", theme=gr.themes.Soft()) as demo:
102
- gr.Markdown("# 📄 Fast PDF Curriculum Search\nAsk a question and see the most relevant slide page!")
103
  with gr.Row():
104
  with gr.Column():
105
  question = gr.Textbox(label="Ask a question", placeholder="e.g., What are for loops?", lines=2)
106
- submit = gr.Button("🔍 Search")
107
- answer = gr.Markdown(label="Relevant Page Text")
108
  with gr.Column():
109
- gallery = gr.Gallery(label="Relevant PDF Page", columns=1, rows=1, height="auto", object_fit="contain")
110
  submit.click(fn=gradio_search, inputs=question, outputs=[answer, gallery])
111
  question.submit(fn=gradio_search, inputs=question, outputs=[answer, gallery])
112
 
 
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.llms import HuggingFacePipeline
9
+ from langchain.prompts import PromptTemplate
10
+ from langchain.chains import LLMChain
11
+ from transformers import pipeline
12
+ import torch
13
  import base64
14
  from PIL import Image
15
  import io
 
25
  self.chunk_metadata = []
26
  self.vector_db = None
27
  self.embeddings = None
28
+ self.llm = None
29
+ self.qa_chain = None
30
  self._process_pdfs(slides_dir)
31
  self._build_vector_db()
32
+ self._setup_llm()
33
 
34
  def _process_pdfs(self, slides_dir):
35
  slides_path = Path(slides_dir)
 
61
  metadatas=self.chunk_metadata,
62
  persist_directory="./chroma_db"
63
  )
64
+
65
+ def _setup_llm(self):
66
+ try:
67
+ # Use Llama 3.1-8B for better question answering
68
+ model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
69
+ pipe = pipeline(
70
+ "text-generation",
71
+ model=model_name,
72
+ max_new_tokens=200,
73
+ temperature=0.3,
74
+ do_sample=True,
75
+ top_p=0.9,
76
+ repetition_penalty=1.1,
77
+ device_map="auto" if torch.cuda.is_available() else None
78
+ )
79
+ self.llm = HuggingFacePipeline(pipeline=pipe)
80
+
81
+ # Create a better QA prompt template for Llama
82
+ qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
83
+
84
+ You are a helpful AI assistant that answers questions about programming concepts based on curriculum content. Provide clear, accurate, and educational answers.
85
+
86
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
87
+
88
+ Based on the following curriculum content, please answer this question:
89
+
90
+ Context: {context}
91
+
92
+ Question: {question}
93
+
94
+ <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
95
+
96
+ prompt = PromptTemplate(
97
+ input_variables=["context", "question"],
98
+ template=qa_template
99
+ )
100
+
101
+ self.qa_chain = LLMChain(llm=self.llm, prompt=prompt)
102
+ print("✅ Llama 3.1-8B loaded successfully!")
103
+ except Exception as e:
104
+ print(f"Warning: Could not load Llama 3.1-8B: {e}")
105
+ print("Falling back to basic search mode...")
106
+ self.llm = None
107
+ self.qa_chain = None
108
 
109
  def get_pdf_page_image(self, pdf_path, page_num):
110
  try:
 
126
  return None
127
 
128
  def search(self, query):
129
+ # Find multiple relevant chunks for better context
130
+ results = self.vector_db.similarity_search(query, k=3)
131
  if not results:
132
+ return "No relevant content found in the curriculum.", None, None
133
+
134
+ # Get the most relevant page for display
135
+ best_result = results[0]
136
+ filename = best_result.metadata["filename"]
137
+ page_number = best_result.metadata["page_number"]
138
+
139
+ # Combine context from multiple pages
140
+ context = "\n\n".join([result.page_content for result in results])
141
+
142
+ # Generate answer if LLM is available
143
+ if self.qa_chain:
144
+ try:
145
+ answer = self.qa_chain.run(context=context, question=query)
146
+ # Clean up the answer (remove any extra formatting)
147
+ answer = answer.strip()
148
+ # Remove any remaining prompt artifacts
149
+ if "<|eot_id|>" in answer:
150
+ answer = answer.split("<|eot_id|>")[-1].strip()
151
+ if answer.startswith("Answer:"):
152
+ answer = answer[7:].strip()
153
+ except Exception as e:
154
+ print(f"Error generating answer: {e}")
155
+ answer = f"Based on the curriculum content:\n\n{best_result.page_content}"
156
+ else:
157
+ # Fallback to showing the most relevant page content
158
+ answer = f"Most relevant content from the curriculum:\n\n{best_result.page_content}"
159
+
160
+ # Get the image of the most relevant page
161
  img = self.get_pdf_page_image(self.pdf_files[filename], page_number)
162
+
163
  if img:
164
+ return answer, img, f"{filename} - Page {page_number}"
165
  else:
166
+ return answer, None, f"{filename} - Page {page_number}"
167
 
168
  # --- Gradio UI ---
169
  searcher = FastPDFSearch()
 
175
  else:
176
  return text, []
177
 
178
+ with gr.Blocks(title="AI Curriculum Assistant", theme=gr.themes.Soft()) as demo:
179
+ gr.Markdown("# 🤖 AI Curriculum Assistant\nAsk questions about programming concepts and get AI-generated answers based on the curriculum!")
180
  with gr.Row():
181
  with gr.Column():
182
  question = gr.Textbox(label="Ask a question", placeholder="e.g., What are for loops?", lines=2)
183
+ submit = gr.Button("🤖 Ask AI")
184
+ answer = gr.Markdown(label="AI Answer")
185
  with gr.Column():
186
+ gallery = gr.Gallery(label="Relevant Slide Page", columns=1, rows=1, height="auto", object_fit="contain")
187
  submit.click(fn=gradio_search, inputs=question, outputs=[answer, gallery])
188
  question.submit(fn=gradio_search, inputs=question, outputs=[answer, gallery])
189
 
app_config.toml CHANGED
@@ -11,7 +11,7 @@ HF_HOME = "/tmp/hf_home"
11
 
12
  [models]
13
  # Preload models for faster startup
14
- "microsoft/DialoGPT-medium" = "dialo-medium"
15
  "sentence-transformers/all-MiniLM-L6-v2" = "all-minilm-l6-v2"
16
 
17
  [datasets]
@@ -19,13 +19,17 @@ HF_HOME = "/tmp/hf_home"
19
 
20
  [hardware]
21
  # Hardware requirements for Gradio
22
- cpu = "2"
23
- memory = "8GB"
24
- disk = "10GB"
25
 
26
  [gradio]
27
  # Gradio specific settings
28
- title = "Inclusive World Curriculum Assistant"
29
- description = "AI-powered assistant that answers questions about curriculum and shows relevant slide pages"
30
- theme = "soft"
31
- share = false
 
 
 
 
 
11
 
12
  [models]
13
  # Preload models for faster startup
14
+ "meta-llama/Meta-Llama-3.1-8B-Instruct" = "llama-3.1-8b"
15
  "sentence-transformers/all-MiniLM-L6-v2" = "all-minilm-l6-v2"
16
 
17
  [datasets]
 
19
 
20
  [hardware]
21
  # Hardware requirements for Gradio
22
+ cpu = "4"
23
+ memory = "16GB"
24
+ disk = "20GB"
25
 
26
  [gradio]
27
  # Gradio specific settings
28
+ title = "AI Curriculum Assistant"
29
+ emoji = "🤖"
30
+ colorFrom = "blue"
31
+ colorTo = "purple"
32
+ sdk = "gradio"
33
+ sdk_version = "4.0.0"
34
+ app_file = "app.py"
35
+ pinned = false
requirements.txt CHANGED
@@ -1,9 +1,10 @@
1
- gradio
2
- langchain
3
- langchain-community
4
- chromadb
5
- sentence-transformers
6
- transformers
7
- torch
8
- PyMuPDF
9
- accelerate
 
 
1
+ gradio>=4.0.0
2
+ PyMuPDF>=1.23.0
3
+ langchain>=0.1.0
4
+ langchain-community>=0.0.20
5
+ sentence-transformers>=2.2.0
6
+ chromadb>=0.4.0
7
+ transformers>=4.35.0
8
+ torch>=2.0.0
9
+ Pillow>=10.0.0
10
+ accelerate>=0.20.0