Spaces:

sangyan5
/

Neural-Assessment-Generator

Build error

App Files Files Community

Neural-Assessment-Generator / Question_Gen3.py

sangyan5

Upload 11 files

7312afb verified 26 days ago

raw

history blame contribute delete

4.27 kB

	import csv
	import os
	import re

	import spacy
	from keybert import KeyBERT # <--- The new Research Component
	from transformers import pipeline

	# =====================
	# 1️⃣ Load Models
	# =====================
	# The three models below are created by top-level statements, so they are
	# loaded whenever this module is run or imported, and are then shared by
	# the functions defined further down.
	print("Initializing models...")
	# spaCy English pipeline; its noun chunks are used to expand a keyword
	# into a full phrase.
	nlp = spacy.load("en_core_web_sm")
	# Hugging Face text2text-generation pipeline around the
	# "valhalla/t5-base-qg-hl" checkpoint; it receives prompts in which the
	# answer is wrapped in <hl> markers.
	qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
	# KeyBERT keyword extractor, constructed with its default arguments.
	kw_model = KeyBERT()

	# =====================
	# 2️⃣ Helper: Get the Best Answer Phrase
	# =====================
	def get_best_answer_span(sentence):
	    """Pick the phrase of *sentence* that should serve as the answer.

	    Hybrid approach:

	    1. KeyBERT selects the single most important word of the sentence.
	    2. spaCy expands that word to the noun chunk containing it
	       (e.g. "tree" -> "the big oak tree").

	    Args:
	        sentence: The sentence to analyse.

	    Returns:
	        The text of the matching noun chunk; the bare keyword when no noun
	        chunk contains it; or ``None`` when the sentence is empty/blank or
	        KeyBERT yields no keyword.
	    """
	    # Blank input cannot produce a keyword, so skip both models entirely.
	    if not sentence or not sentence.strip():
	        return None

	    # Step 1: extract the single best 1-gram to find the "topic".
	    keywords = kw_model.extract_keywords(
	        sentence,
	        keyphrase_ngram_range=(1, 1),  # single words only
	        stop_words='english',
	        top_n=1,
	    )
	    if not keywords:
	        return None

	    # Each entry is a (keyword, score) pair; only the word is needed.
	    top_keyword = keywords[0][0]

	    # KeyBERT keywords normally come back lower-cased (its default
	    # scikit-learn vectorizer lowercases text), so every comparison below
	    # is case-insensitive. A case-sensitive test would miss a chunk such
	    # as "Machine learning" for the keyword "machine".
	    needle = top_keyword.lower()

	    # Step 2: find the noun chunk that contains the keyword.
	    noun_chunks = list(nlp(sentence).noun_chunks)

	    # Prefer a chunk where the keyword is a whole token, so that "art"
	    # selects "modern art" rather than "the party".
	    for chunk in noun_chunks:
	        if any(token.text.lower() == needle for token in chunk):
	            return chunk.text.strip()

	    # Otherwise accept a substring hit; this covers words that spaCy and
	    # KeyBERT tokenize differently.
	    for chunk in noun_chunks:
	        if needle in chunk.text.lower():
	            return chunk.text.strip()

	    # Step 3: the keyword is in no noun chunk (rare); use it as-is.
	    # strip() removes surrounding whitespace only, not punctuation.
	    return top_keyword.strip()

	# =====================
	# 3️⃣ Main Generation Function
	# =====================
	def generate_question(sentence):
	    """Generate a question whose answer is the key phrase of *sentence*.

	    The answer phrase comes from ``get_best_answer_span``. It is wrapped in
	    ``<hl>`` markers inside the sentence, the result is prefixed with
	    ``generate question:`` and passed to ``qg_pipeline``. The sentence,
	    answer and generated question are also printed.

	    Args:
	        sentence: The source sentence.

	    Returns:
	        ``(sentence, answer, question)`` on success, or
	        ``(None, None, None)`` when no answer phrase could be found.
	    """
	    # Step 1: get the best answer phrase.
	    answer = get_best_answer_span(sentence)
	    if not answer:
	        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
	        return None, None, None

	    # Step 2: highlight the answer for T5.
	    # The answer is located case-insensitively in the ORIGINAL string.
	    # Searching a lower-cased copy and reusing its offsets is unsafe,
	    # because str.lower() can change the string's length (e.g. "İ"),
	    # which would shift the highlighted span.
	    match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
	    if match is None:
	        # Fallback when the answer text does not occur verbatim:
	        # append the highlighted answer after the sentence.
	        highlighted = f"{sentence} <hl> {answer} <hl>"
	    else:
	        # match.group(0) keeps the sentence's original casing.
	        highlighted = (
	            f"{sentence[:match.start()]}<hl> {match.group(0)} <hl>"
	            f"{sentence[match.end():]}"
	        )
	    prompt = f"generate question: {highlighted}"

	    # Step 3: generate the question.
	    result = qg_pipeline(prompt, max_new_tokens=64)
	    question = result[0]['generated_text']

	    print(f"\n🧠 Sentence: {sentence}")
	    print(f"🎯 Answer (Phrase): {answer}")
	    print(f"❓ Generated Question: {question}")

	    return sentence, answer, question

	# =====================
	# 4️⃣ Process Input File
	# =====================
	def main(input_file="mockSentence.txt", output_file="mockQuestions_v2.csv"):
	    """Generate one question per line of *input_file* and save them as CSV.

	    If *input_file* does not exist, a two-sentence sample file is created
	    first. Blank lines are ignored. A sentence is left out of the output
	    when ``generate_question`` raises or returns no sentence for it.

	    Args:
	        input_file: Path of a UTF-8 text file with one sentence per line.
	        output_file: Path of the CSV file to write. It is overwritten.
	    """
	    # Create a sample input file if none exists. It is written as UTF-8
	    # because that is the encoding used to read it back below.
	    if not os.path.exists(input_file):
	        with open(input_file, "w", encoding="utf-8") as f:
	            f.writelines([
	                "Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n",
	                "Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n",
	            ])

	    with open(input_file, 'r', encoding='utf-8') as f:
	        sentences = [line.strip() for line in f if line.strip()]

	    print(f"\nLoaded {len(sentences)} sentences.")

	    results = []
	    for sent in sentences:
	        # Best-effort batch: a failure on one sentence is reported and
	        # must not abort the remaining ones.
	        try:
	            s, ans, q = generate_question(sent)
	        except Exception as e:
	            print(f"⚠️ Error processing: {sent}\n{e}")
	            continue
	        if s:  # only keep sentences for which generation succeeded
	            results.append((s, ans, q))

	    # Save to CSV. newline='' lets the csv module control line endings.
	    with open(output_file, 'w', newline='', encoding='utf-8') as f:
	        writer = csv.writer(f)
	        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
	        writer.writerows(results)

	    print(f"\n✅ Done! Check {output_file}")


	if __name__ == "__main__":
	    main()