"""Generate one question per input sentence.

Pipeline, as implemented below:
1. KeyBERT picks the single most relevant word of the sentence.
2. spaCy expands that word to the noun chunk containing it (the "answer").
3. The answer is marked with highlight tokens and passed to the
   ``valhalla/t5-base-qg-hl`` text2text pipeline, which generates a question.

When run as a script, sentences are read from ``mockSentence.txt`` (one per
line) and the results are written to ``mockQuestions_v2.csv``.
"""
import csv
import os
import re
from typing import Optional, Tuple

import spacy
from keybert import KeyBERT  # <--- The new Research Component
from transformers import pipeline

# =====================
# 1️⃣ Load Models
# =====================
print("Initializing models...")
nlp = spacy.load("en_core_web_sm")
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
kw_model = KeyBERT()  # Load the Keyword Extraction Model

# The "-hl" question-generation checkpoint expects the answer span to be
# wrapped in this token inside the input text.
HIGHLIGHT_TOKEN = "<hl>"


# =====================
# 2️⃣ Helper: Get the Best Answer Phrase
# =====================
def get_best_answer_span(sentence: str) -> Optional[str]:
    """Return the phrase of *sentence* to use as the answer, or ``None``.

    Hybrid approach:
    1. Use KeyBERT to find the most important WORD.
    2. Use spaCy to find the full NOUN PHRASE containing that word.

    Args:
        sentence: The sentence to analyse.

    Returns:
        The text of the first noun chunk containing the top keyword, the
        keyword itself if no noun chunk contains it, or ``None`` if KeyBERT
        returns no keyword.
    """
    doc = nlp(sentence)

    # Step 1: Extract keywords (1-gram only) to find the "topic"
    keywords = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),  # Only look for single words first
        stop_words='english',
        top_n=1,
    )

    if not keywords:
        return None

    top_keyword = keywords[0][0]  # e.g., "tree"

    # KeyBERT's default vectorizer lower-cases candidates, so the keyword for
    # "SpaceX" comes back as "spacex". Compare case-insensitively; otherwise
    # capitalised words never match their own noun chunk.
    keyword_lower = top_keyword.lower()

    # Step 2: Find the Noun Chunk that contains this keyword
    # This turns "tree" -> "the big oak tree"
    best_phrase = None
    for chunk in doc.noun_chunks:
        if keyword_lower in chunk.text.lower():
            best_phrase = chunk.text
            break

    # Step 3: Fallback - If the keyword isn't in a noun chunk (rare),
    # use the keyword itself
    if not best_phrase:
        best_phrase = top_keyword

    # Clean up (remove leading/trailing whitespace)
    return best_phrase.strip()


# =====================
# 3️⃣ Main Generation Function
# =====================
def _highlight_answer(sentence: str, answer: str) -> str:
    """Return *sentence* with the first occurrence of *answer* highlighted.

    The match is case-insensitive and performed on the original string, so
    the offsets are valid for *sentence* and its casing is preserved. If the
    answer does not occur in the sentence, the highlighted answer is appended
    after the sentence instead.
    """
    match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
    if match is None:
        # Fallback if text format differs slightly
        return f"{sentence} {HIGHLIGHT_TOKEN} {answer} {HIGHLIGHT_TOKEN}"

    return (
        sentence[:match.start()]
        + f"{HIGHLIGHT_TOKEN} {match.group(0)} {HIGHLIGHT_TOKEN}"
        + sentence[match.end():]
    )


def generate_question(
    sentence: str,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Generate a question whose answer is the key phrase of *sentence*.

    Args:
        sentence: The sentence to generate a question from.

    Returns:
        ``(sentence, answer, question)`` on success, or
        ``(None, None, None)`` when no answer phrase could be extracted.
    """
    # Step 1: Get the best answer phrase
    answer = get_best_answer_span(sentence)

    if not answer:
        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
        return None, None, None

    # Step 2: Highlight the answer for T5
    highlighted = f"generate question: {_highlight_answer(sentence, answer)}"

    # Step 3: Generate Question
    result = qg_pipeline(highlighted, max_new_tokens=64)
    question = result[0]['generated_text']

    print(f"\n🧠 Sentence: {sentence}")
    print(f"🎯 Answer (Phrase): {answer}")
    print(f"❓ Generated Question: {question}")

    return sentence, answer, question


# =====================
# 4️⃣ Process Input File
# =====================
def main() -> None:
    """Read sentences from the input file and write questions to a CSV."""
    input_file = "mockSentence.txt"
    output_file = "mockQuestions_v2.csv"

    # Create dummy file if not exists. Written as UTF-8 to match the
    # encoding used when the file is read back below.
    if not os.path.exists(input_file):
        with open(input_file, "w", encoding="utf-8") as f:
            f.write("Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n")
            f.write("Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n")

    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]

    print(f"\nLoaded {len(sentences)} sentences.")

    results = []
    for sent in sentences:
        # Best-effort batch: one failing sentence is reported and skipped so
        # the remaining sentences are still processed.
        try:
            s, ans, q = generate_question(sent)
            if s:  # Only add if generation was successful
                results.append((s, ans, q))
        except Exception as e:
            print(f"⚠️ Error processing: {sent}\n{e}")

    # Save to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
        writer.writerows(results)

    print(f"\n✅ Done! Check {output_file}")


if __name__ == "__main__":
    main()