Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -38,10 +38,12 @@ class CurriculumAssistant:
|
|
| 38 |
"text-generation",
|
| 39 |
model=model,
|
| 40 |
tokenizer=tokenizer,
|
| 41 |
-
max_new_tokens=256,
|
| 42 |
temperature=0.7,
|
| 43 |
top_p=0.95,
|
| 44 |
-
repetition_penalty=1.15
|
|
|
|
|
|
|
| 45 |
)
|
| 46 |
|
| 47 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
|
@@ -101,10 +103,10 @@ class CurriculumAssistant:
|
|
| 101 |
print("No text could be extracted from PDF files!")
|
| 102 |
return False
|
| 103 |
|
| 104 |
-
# Split text into chunks with metadata
|
| 105 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 106 |
-
chunk_size=1000,
|
| 107 |
-
chunk_overlap=200,
|
| 108 |
length_function=len,
|
| 109 |
)
|
| 110 |
|
|
@@ -149,21 +151,17 @@ class CurriculumAssistant:
|
|
| 149 |
if not self.vector_db or not self.llm:
|
| 150 |
return False
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
qa_template = """
|
| 154 |
-
Use the following context to answer the student's question. If the information is not in the context,
|
| 155 |
-
provide a helpful response based on your knowledge of programming concepts.
|
| 156 |
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
Answer:"""
|
| 162 |
|
| 163 |
self.qa_chain = RetrievalQA.from_chain_type(
|
| 164 |
llm=self.llm,
|
| 165 |
chain_type="stuff",
|
| 166 |
-
retriever=self.vector_db.as_retriever(search_kwargs={"k": 5}),
|
| 167 |
chain_type_kwargs={
|
| 168 |
"prompt": PromptTemplate(
|
| 169 |
template=qa_template,
|
|
@@ -178,7 +176,7 @@ class CurriculumAssistant:
|
|
| 178 |
"""Find relevant pages for a given question"""
|
| 179 |
try:
|
| 180 |
# Search for relevant chunks
|
| 181 |
-
results = self.vector_db.similarity_search(question, k=5)
|
| 182 |
|
| 183 |
relevant_pages = []
|
| 184 |
seen_pages = set()
|
|
@@ -215,7 +213,7 @@ class CurriculumAssistant:
|
|
| 215 |
|
| 216 |
# Sort by relevance and return top results
|
| 217 |
relevant_pages.sort(key=lambda x: x['relevance_score'], reverse=True)
|
| 218 |
-
return relevant_pages[:3]
|
| 219 |
|
| 220 |
except Exception as e:
|
| 221 |
print(f"Error finding relevant pages: {str(e)}")
|
|
@@ -257,7 +255,7 @@ def ask_question(question: str, assistant: CurriculumAssistant):
|
|
| 257 |
page_info = "📄 **Relevant Pages Found:**\n\n"
|
| 258 |
for i, page in enumerate(relevant_pages, 1):
|
| 259 |
page_info += f"**{i}. {page['filename']} - Page {page['page_number']}**\n"
|
| 260 |
-
page_info += f"```\n{page['content'][:300]}...\n```\n\n"
|
| 261 |
else:
|
| 262 |
page_info = "No specific pages found for this question."
|
| 263 |
|
|
|
|
| 38 |
"text-generation",
|
| 39 |
model=model,
|
| 40 |
tokenizer=tokenizer,
|
| 41 |
+
max_new_tokens=128, # Reduced from 256
|
| 42 |
temperature=0.7,
|
| 43 |
top_p=0.95,
|
| 44 |
+
repetition_penalty=1.15,
|
| 45 |
+
do_sample=True,
|
| 46 |
+
pad_token_id=tokenizer.eos_token_id
|
| 47 |
)
|
| 48 |
|
| 49 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
|
|
|
| 103 |
print("No text could be extracted from PDF files!")
|
| 104 |
return False
|
| 105 |
|
| 106 |
+
# Split text into smaller chunks with metadata
|
| 107 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 108 |
+
chunk_size=500, # Reduced from 1000
|
| 109 |
+
chunk_overlap=50, # Reduced from 200
|
| 110 |
length_function=len,
|
| 111 |
)
|
| 112 |
|
|
|
|
| 151 |
if not self.vector_db or not self.llm:
|
| 152 |
return False
|
| 153 |
|
| 154 |
+
# Shorter prompt template for DialoGPT
|
| 155 |
+
qa_template = """Context: {context}
|
|
|
|
|
|
|
| 156 |
|
| 157 |
+
Question: {question}
|
| 158 |
|
| 159 |
+
Answer:"""
|
|
|
|
|
|
|
| 160 |
|
| 161 |
self.qa_chain = RetrievalQA.from_chain_type(
|
| 162 |
llm=self.llm,
|
| 163 |
chain_type="stuff",
|
| 164 |
+
retriever=self.vector_db.as_retriever(search_kwargs={"k": 2}), # Reduced from 5
|
| 165 |
chain_type_kwargs={
|
| 166 |
"prompt": PromptTemplate(
|
| 167 |
template=qa_template,
|
|
|
|
| 176 |
"""Find relevant pages for a given question"""
|
| 177 |
try:
|
| 178 |
# Search for relevant chunks
|
| 179 |
+
results = self.vector_db.similarity_search(question, k=3) # Reduced from 5
|
| 180 |
|
| 181 |
relevant_pages = []
|
| 182 |
seen_pages = set()
|
|
|
|
| 213 |
|
| 214 |
# Sort by relevance and return top results
|
| 215 |
relevant_pages.sort(key=lambda x: x['relevance_score'], reverse=True)
|
| 216 |
+
return relevant_pages[:2] # Reduced from 3
|
| 217 |
|
| 218 |
except Exception as e:
|
| 219 |
print(f"Error finding relevant pages: {str(e)}")
|
|
|
|
| 255 |
page_info = "📄 **Relevant Pages Found:**\n\n"
|
| 256 |
for i, page in enumerate(relevant_pages, 1):
|
| 257 |
page_info += f"**{i}. {page['filename']} - Page {page['page_number']}**\n"
|
| 258 |
+
page_info += f"```\n{page['content'][:200]}...\n```\n\n" # Reduced from 300
|
| 259 |
else:
|
| 260 |
page_info = "No specific pages found for this question."
|
| 261 |
|