# Neural-Assessment-Generator / Question_Gen3.py
# Uploaded by sangyan5 ("Upload 11 files", commit 7312afb, verified).
import csv
import os
import re

import spacy
from keybert import KeyBERT # <--- The new Research Component
from transformers import pipeline
# =====================
# 1️⃣ Load Models
# =====================
# The three models below are created once, at import time, and are used as
# module-level globals by the functions further down in this file.
print("Initializing models...")
# spaCy English pipeline; its noun chunks are used to expand a keyword into a
# full answer phrase.
nlp = spacy.load("en_core_web_sm")
# Text-to-text question generator; the prompt marks the answer span with
# "<hl>" tokens and starts with "generate question:".
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
kw_model = KeyBERT() # Load the Keyword Extraction Model
# =====================
# 2️⃣ Helper: Get the Best Answer Phrase
# =====================
def get_best_answer_span(sentence):
    """Pick the phrase in *sentence* that should serve as the answer.

    Hybrid approach:
    1. Use KeyBERT to find the single most important word.
    2. Use spaCy to expand that word to the noun chunk containing it
       (e.g. "tree" -> "the big oak tree").

    Matching is case-insensitive because KeyBERT's default vectorizer
    returns lowercased keywords, while noun chunks keep the sentence's
    original casing (e.g. keyword "spacex" vs. chunk "SpaceX").

    Args:
        sentence: Raw input sentence.

    Returns:
        The selected phrase in its original casing with surrounding
        whitespace removed, or ``None`` when the sentence is blank or no
        keyword could be extracted.
    """
    if not sentence or not sentence.strip():
        return None

    # Step 1: Extract the single top keyword (1-grams only) to find the "topic".
    keywords = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),  # Only look for single words
        stop_words='english',
        top_n=1,
    )
    if not keywords:
        return None
    top_keyword = keywords[0][0]  # e.g., "tree"
    needle = top_keyword.lower()

    # Parse only once we know there is a keyword to look for.
    doc = nlp(sentence)

    # Step 2: Find the noun chunk that contains this keyword.
    # A chunk holding the keyword as a whole token wins immediately; a chunk
    # that merely contains it as a substring (e.g. "art" inside "party") is
    # remembered as a weaker candidate.
    substring_match = None
    for chunk in doc.noun_chunks:
        if any(token.text.lower() == needle for token in chunk):
            return chunk.text.strip()
        if substring_match is None and needle in chunk.text.lower():
            substring_match = chunk.text
    if substring_match is not None:
        return substring_match.strip()

    # Step 3: Fallback - the keyword is not inside any noun chunk (rare).
    # Prefer the matching token so the answer keeps the sentence's casing.
    for token in doc:
        if token.text.lower() == needle:
            return token.text.strip()
    return top_keyword.strip()
# =====================
# 3️⃣ Main Generation Function
# =====================
def generate_question(sentence):
    """Generate one question whose answer is the key phrase of *sentence*.

    The answer phrase is chosen by ``get_best_answer_span``, wrapped in
    ``<hl>`` markers inside the sentence, and the result is sent to the
    question-generation pipeline with the "generate question: " prefix.
    The sentence, answer and question are also printed.

    Args:
        sentence: Raw input sentence.

    Returns:
        A ``(sentence, answer, question)`` tuple on success, or
        ``(None, None, None)`` when no answer phrase could be selected.
    """
    # Step 1: Get the best answer phrase
    answer = get_best_answer_span(sentence)
    if not answer:
        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
        return None, None, None

    # Step 2: Locate the answer inside the sentence.
    # Try an exact match first, then a case-insensitive one. The regex runs on
    # the original string, so the offsets are always valid for slicing it
    # (offsets taken from sentence.lower() can drift, because lowercasing may
    # change the length of some Unicode strings).
    start_idx = sentence.find(answer)
    end_idx = start_idx + len(answer)
    if start_idx == -1:
        match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
        if match:
            start_idx, end_idx = match.span()

    if start_idx == -1:
        # Fallback if text format differs slightly: append the highlighted
        # answer after the sentence instead of marking it in place.
        highlighted = f"{sentence} <hl> {answer} <hl>"
    else:
        # Slice the original sentence so its casing is preserved.
        highlighted = (
            f"{sentence[:start_idx]}"
            f"<hl> {sentence[start_idx:end_idx]} <hl>"
            f"{sentence[end_idx:]}"
        )
    # The task prefix is added exactly once, whichever branch ran.
    prompt = f"generate question: {highlighted}"

    # Step 3: Generate Question
    result = qg_pipeline(prompt, max_new_tokens=64)
    question = result[0]['generated_text']

    print(f"\n🧠 Sentence: {sentence}")
    print(f"🎯 Answer (Phrase): {answer}")
    print(f"❓ Generated Question: {question}")
    return sentence, answer, question
# =====================
# 4️⃣ Process Input File
# =====================
# Script entry point: read one sentence per line from the input file, generate
# a question for each, and write the (sentence, answer, question) rows to CSV.
if __name__ == "__main__":
    input_file = "mockSentence.txt"
    output_file = "mockQuestions_v2.csv"

    # Create a small demo input file if none exists, so the script runs out of
    # the box. Written as UTF-8 to match the encoding used to read it back.
    if not os.path.exists(input_file):
        with open(input_file, "w", encoding="utf-8") as f:
            f.write("Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n")
            f.write("Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n")

    # One sentence per non-blank line.
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    print(f"\nLoaded {len(sentences)} sentences.")

    results = []
    for sent in sentences:
        try:
            s, ans, q = generate_question(sent)
        except Exception as e:
            # Best-effort batch run: report the failure and move on so one bad
            # sentence does not abort the whole file.
            print(f"⚠️ Error processing: {sent}\n{e}")
            continue
        if s:  # Only add if generation was successful
            results.append((s, ans, q))

    # Save to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
        writer.writerows(results)
    print(f"\n✅ Done! Check {output_file}")