Spaces:
Build error
Build error
| from transformers import pipeline | |
| import spacy | |
| import csv | |
| from keybert import KeyBERT # <--- The new Research Component | |
| # ===================== | |
| # 1️⃣ Load Models | |
| # ===================== | |
# Model identifiers, named once so they are easy to locate and swap.
SPACY_MODEL_NAME = "en_core_web_sm"
QG_MODEL_NAME = "valhalla/t5-base-qg-hl"

print("Initializing models...")

# spaCy pipeline: supplies the noun chunks used to expand a keyword into a phrase.
nlp = spacy.load(SPACY_MODEL_NAME)

# Text-to-text pipeline that turns a highlighted sentence into a question.
qg_pipeline = pipeline("text2text-generation", model=QG_MODEL_NAME)

# Keyword extractor, constructed with its default settings.
kw_model = KeyBERT()
| # ===================== | |
| # 2️⃣ Helper: Get the Best Answer Phrase | |
| # ===================== | |
def get_best_answer_span(sentence):
    """Pick the phrase in *sentence* that a generated question should target.

    Hybrid approach:
      1. KeyBERT selects the single most important word of the sentence.
      2. spaCy expands that word to the noun chunk that contains it,
         e.g. "tree" -> "the big oak tree".

    Args:
        sentence: The sentence to analyse.

    Returns:
        The noun chunk containing the top keyword (original casing, outer
        whitespace trimmed); the bare keyword when no noun chunk contains
        it; or ``None`` when the sentence is blank or yields no keyword.
    """
    # Nothing to extract from a blank sentence; skip both model calls.
    if not sentence or not sentence.strip():
        return None

    # Step 1: extract the top keyword (1-grams only) to find the "topic".
    keywords = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),  # single words only
        stop_words='english',
        top_n=1,
    )
    if not keywords:
        return None

    top_keyword = keywords[0][0]  # e.g. "tree"
    # KeyBERT's default vectorizer is expected to lowercase keywords, while
    # noun chunks keep the sentence's casing -- compare case-insensitively so
    # that "spacex" still finds the chunk "SpaceX".
    needle = top_keyword.lower()

    # Parse only once we know there is a keyword to look for.
    chunks = list(nlp(sentence).noun_chunks)

    # Step 2a: prefer a chunk holding the keyword as a whole token, so that
    # "art" does not select a chunk merely because it contains "particle".
    for chunk in chunks:
        if any(token.text.lower() == needle for token in chunk):
            return chunk.text.strip()

    # Step 2b: fall back to a substring match, for cases where spaCy's
    # tokenisation differs from the keyword extractor's.
    for chunk in chunks:
        if needle in chunk.text.lower():
            return chunk.text.strip()

    # Step 3: the keyword is not inside any noun chunk (rare); use it as-is.
    # Note: strip() trims surrounding whitespace only, not punctuation.
    return top_keyword.strip()
| # ===================== | |
| # 3️⃣ Main Generation Function | |
| # ===================== | |
def _highlight_answer(sentence, answer):
    """Build the question-generation prompt with *answer* wrapped in <hl> tags."""
    # Function-scope import keeps this block self-contained.
    import re

    # Search the ORIGINAL sentence case-insensitively. Locating the answer in
    # sentence.lower() and slicing the original with those offsets is unsafe:
    # str.lower() can change the string's length for some Unicode characters
    # (e.g. "İ"), which shifts every later index.
    match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
    if match is None:
        # Fallback if the answer text differs slightly from the sentence.
        return f"generate question: {sentence} <hl> {answer} <hl>"

    start, end = match.span()
    # Slice the sentence itself so the highlighted text keeps its casing.
    marked = f"{sentence[:start]}<hl> {sentence[start:end]} <hl>{sentence[end:]}"
    return f"generate question: {marked}"


def generate_question(sentence):
    """Generate one question whose answer is the key phrase of *sentence*.

    Args:
        sentence: The source sentence.

    Returns:
        A ``(sentence, answer, question)`` tuple on success, or
        ``(None, None, None)`` when no answer phrase could be found.
        Progress is also printed to stdout.
    """
    # Step 1: get the best answer phrase.
    answer = get_best_answer_span(sentence)
    if not answer:
        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
        return None, None, None

    # Step 2: highlight the answer inside the sentence for the model.
    prompt = _highlight_answer(sentence, answer)

    # Step 3: generate the question.
    result = qg_pipeline(prompt, max_new_tokens=64)
    question = result[0]['generated_text']

    print(f"\n🧠 Sentence: {sentence}")
    print(f"🎯 Answer (Phrase): {answer}")
    print(f"❓ Generated Question: {question}")
    return sentence, answer, question
| # ===================== | |
| # 4️⃣ Process Input File | |
| # ===================== | |
def main(input_file="mockSentence.txt", output_file="mockQuestions_v2.csv"):
    """Read sentences from *input_file* and write generated questions as CSV.

    Args:
        input_file: Text file with one sentence per line. If it does not
            exist, a two-sentence sample file is created first.
        output_file: Destination CSV with the columns Sentence,
            Answer_Phrase and Generated_Question.
    """
    import os

    # Create a sample input file if none exists. It is written as UTF-8,
    # the same encoding used to read it back below.
    if not os.path.exists(input_file):
        with open(input_file, "w", encoding="utf-8") as f:
            f.write("Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n")
            f.write("Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n")

    # Blank lines are dropped; surrounding whitespace is trimmed.
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    print(f"\nLoaded {len(sentences)} sentences.")

    results = []
    for sent in sentences:
        # Best effort: a failure on one sentence is reported and must not
        # abort the rest of the batch.
        try:
            s, ans, q = generate_question(sent)
        except Exception as e:
            print(f"⚠️ Error processing: {sent}\n{e}")
            continue
        if s:  # only keep sentences for which generation succeeded
            results.append((s, ans, q))

    # Save to CSV. newline='' lets the csv module control line endings.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
        writer.writerows(results)
    print(f"\n✅ Done! Check {output_file}")


if __name__ == "__main__":
    main()