# MidtermExam / utils.py
# Author: rjelbruiz320 — "Update utils.py" (commit 8f0847b, verified)
import os
from functools import lru_cache

from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import pipeline
import numpy as np
#Load .env file
load_dotenv()
#Getting the model name from environment (default if not found)
MODEL_NAME = os.getenv("MODEL_NAME", "google/flan-t5-small")
def load_handbook_text(pdf_path):
    """Extract and concatenate the text of every page in the PDF.

    Args:
        pdf_path: Path to the handbook PDF file.

    Returns:
        A single space-joined string of all extracted page text. Pages
        that yield no text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(pdf_path)
    # Call extract_text() exactly once per page: extraction is expensive,
    # and the original called it twice (once to filter, once to join).
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(text for text in page_texts if text)
def split_text(text, max_len=800):
    """Break *text* into chunks of at most ``max_len`` words.

    Args:
        text: Raw document text.
        max_len: Maximum number of whitespace-delimited words per chunk.

    Returns:
        A list of strings; empty input yields an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), max_len):
        chunks.append(" ".join(tokens[start:start + max_len]))
    return chunks
def find_relevant_chunk(query, chunks):
    """Return the chunk with the largest unique-word overlap with *query*.

    A deliberately lightweight bag-of-words retrieval: each chunk is
    scored by how many distinct lowercased query words it contains.
    Ties resolve to the earliest chunk; an empty *chunks* sequence
    raises ValueError.
    """
    query_words = set(query.lower().split())

    def overlap(index):
        # Number of distinct query words appearing in this chunk.
        return len(query_words.intersection(chunks[index].lower().split()))

    best = max(range(len(chunks)), key=overlap)
    return chunks[best]
@lru_cache(maxsize=1)
def _get_pipeline():
    """Build the text2text-generation pipeline once and reuse it.

    The original code re-instantiated the pipeline (reloading the model
    weights into memory) on every question, which dominates latency.
    """
    return pipeline(
        "text2text-generation",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME
    )

@lru_cache(maxsize=4)
def _load_chunks(pdf_path):
    """Parse and chunk the handbook once per path (returned as a tuple so
    the result is hashable/cacheable)."""
    return tuple(split_text(load_handbook_text(pdf_path)))

def answer_question(query, pdf_path="STUDENT-HANDBOOK-2021-EDITION.pdf"):
    """
    Answer a question using the Student Handbook as the only context.

    Args:
        query: The user's question.
        pdf_path: Path to the handbook PDF (default: 2021 edition).

    Returns:
        A short model-generated answer string, whitespace-trimmed.
    """
    # PDF parsing and chunking are cached per path — the original
    # re-read and re-split the whole PDF on every call.
    chunks = list(_load_chunks(pdf_path))
    context = find_relevant_chunk(query, chunks)
    qa = _get_pipeline()
    # Prompt kept concise and context-limited for the small model.
    prompt = (
        f"Use only the context below to answer concisely and clearly.\n\n"
        f"Question: {query}\n\n"
        f"Context:\n{context[:700]}\n\n"
        f"Answer in 2-3 sentences only."
    )
    # NOTE(review): temperature has no effect without do_sample=True when
    # beam search is used — left as-is to preserve existing behavior.
    result = qa(prompt, max_new_tokens=100, temperature=0.3, num_beams=4)
    return result[0]["generated_text"].strip()