Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from transformers import pipeline | |
| from PyPDF2 import PdfReader | |
| import numpy as np | |
# Read configuration from a local .env file, if one exists.
load_dotenv()

# Model identifier is configurable via the environment; falls back to a
# small default checkpoint when unset.
MODEL_NAME = os.environ.get("MODEL_NAME", "google/flan-t5-small")
def load_handbook_text(pdf_path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A single space-joined string of all non-empty page texts.
    """
    reader = PdfReader(pdf_path)
    # Extract each page's text exactly once (the original called
    # extract_text() twice per page: once to filter, once to join).
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(text for text in page_texts if text)
def split_text(text, max_len=800):
    """Break *text* into word-count-bounded chunks.

    Args:
        text: The input string to split on whitespace.
        max_len: Maximum number of words per chunk.

    Returns:
        A list of strings, each containing at most *max_len* words.
        An empty/whitespace-only input yields an empty list.
    """
    tokens = text.split()
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(" ".join(tokens[start:start + max_len]))
        start += max_len
    return chunks
def find_relevant_chunk(query, chunks):
    """Pick the chunk with the highest keyword overlap with *query*.

    Very lightweight retrieval: score each chunk by the number of distinct
    query words it contains, case-insensitively. Ties resolve to the
    earliest chunk; an empty *chunks* raises ValueError.
    """
    query_words = set(query.lower().split())

    def overlap(chunk):
        # How many distinct query words appear in this chunk.
        return len(query_words.intersection(chunk.lower().split()))

    # max() returns the first maximal element, matching np.argmax's
    # first-index tie-breaking in the original.
    return max(chunks, key=overlap)
# Module-level caches so repeated questions do not reload the model or
# re-parse the handbook PDF on every call (both are expensive).
_QA_PIPELINE_CACHE = {}
_CHUNK_CACHE = {}


def _get_qa_pipeline(model_name):
    """Return a cached text2text-generation pipeline for *model_name*.

    The Hugging Face pipeline downloads/loads model weights, so it is
    built once per model name and reused across calls.
    """
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline(
            "text2text-generation",
            model=model_name,
            tokenizer=model_name,
        )
    return _QA_PIPELINE_CACHE[model_name]


def _get_chunks(pdf_path):
    """Return cached text chunks for *pdf_path*, extracting on first use."""
    if pdf_path not in _CHUNK_CACHE:
        _CHUNK_CACHE[pdf_path] = split_text(load_handbook_text(pdf_path))
    return _CHUNK_CACHE[pdf_path]


def answer_question(query, pdf_path="STUDENT-HANDBOOK-2021-EDITION.pdf"):
    """Answer a question from the Student Handbook, concisely.

    Retrieves the most relevant handbook chunk by keyword overlap, then
    prompts the configured text2text model to answer from that context.

    Args:
        query: The user's question.
        pdf_path: Path to the handbook PDF (parsed once and cached).

    Returns:
        The model's answer as a stripped string.
    """
    context = find_relevant_chunk(query, _get_chunks(pdf_path))
    qa = _get_qa_pipeline(MODEL_NAME)

    # Prompt constrains the model to the retrieved context and a short answer.
    prompt = (
        f"Use only the context below to answer concisely and clearly.\n\n"
        f"Question: {query}\n\n"
        f"Context:\n{context[:700]}\n\n"
        f"Answer in 2-3 sentences only."
    )

    # NOTE: the original passed temperature=0.3, but temperature is only
    # honored when do_sample=True; with beam search (num_beams=4) it was
    # ignored by transformers (with a warning), so it is dropped here.
    result = qa(prompt, max_new_tokens=100, num_beams=4)
    return result[0]["generated_text"].strip()