# MidtermExam / utils.py
# Author: rjelbruiz320 — "Update utils.py" (commit 8f0847b, verified)
import os
from functools import lru_cache

from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import pipeline
import numpy as np
#Load .env file
load_dotenv()
#Getting the model name from environment (default if not found)
MODEL_NAME = os.getenv("MODEL_NAME", "google/flan-t5-small")
def load_handbook_text(pdf_path):
    """Extract and concatenate the text of every page in the PDF.

    Args:
        pdf_path: Path to the handbook PDF file.

    Returns:
        A single space-joined string of all extracted page text. Pages
        that yield no text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(pdf_path)
    # Call extract_text() exactly once per page: extraction is expensive,
    # and the original called it twice (once to filter, once to join).
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(text for text in page_texts if text)
def split_text(text, max_len=800):
    """Break *text* into chunks of at most ``max_len`` words.

    Args:
        text: Raw document text.
        max_len: Maximum number of whitespace-delimited words per chunk.

    Returns:
        A list of strings; empty input yields an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), max_len):
        chunks.append(" ".join(tokens[start:start + max_len]))
    return chunks
def find_relevant_chunk(query, chunks):
    """Return the chunk with the largest unique-word overlap with *query*.

    A deliberately lightweight bag-of-words retrieval: each chunk is
    scored by how many distinct lowercased query words it contains.
    Ties resolve to the earliest chunk; an empty *chunks* sequence
    raises ValueError.
    """
    query_words = set(query.lower().split())

    def overlap(index):
        # Number of distinct query words appearing in this chunk.
        return len(query_words.intersection(chunks[index].lower().split()))

    best = max(range(len(chunks)), key=overlap)
    return chunks[best]
@lru_cache(maxsize=1)
def _get_pipeline():
    """Build the text2text-generation pipeline once and reuse it.

    The original code re-instantiated the pipeline (reloading the model
    weights into memory) on every question, which dominates latency.
    """
    return pipeline(
        "text2text-generation",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME
    )

@lru_cache(maxsize=4)
def _load_chunks(pdf_path):
    """Parse and chunk the handbook once per path (returned as a tuple so
    the result is hashable/cacheable)."""
    return tuple(split_text(load_handbook_text(pdf_path)))

def answer_question(query, pdf_path="STUDENT-HANDBOOK-2021-EDITION.pdf"):
    """
    Answer a question using the Student Handbook as the only context.

    Args:
        query: The user's question.
        pdf_path: Path to the handbook PDF (default: 2021 edition).

    Returns:
        A short model-generated answer string, whitespace-trimmed.
    """
    # PDF parsing and chunking are cached per path — the original
    # re-read and re-split the whole PDF on every call.
    chunks = list(_load_chunks(pdf_path))
    context = find_relevant_chunk(query, chunks)
    qa = _get_pipeline()
    # Prompt kept concise and context-limited for the small model.
    prompt = (
        f"Use only the context below to answer concisely and clearly.\n\n"
        f"Question: {query}\n\n"
        f"Context:\n{context[:700]}\n\n"
        f"Answer in 2-3 sentences only."
    )
    # NOTE(review): temperature has no effect without do_sample=True when
    # beam search is used — left as-is to preserve existing behavior.
    result = qa(prompt, max_new_tokens=100, temperature=0.3, num_beams=4)
    return result[0]["generated_text"].strip()