File size: 4,269 Bytes
7312afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import csv
import re

import spacy
from keybert import KeyBERT # <--- The new Research Component
from transformers import pipeline

# =====================
# 1️⃣ Load Models
# =====================
# The three model objects below are created once, at import time, and are
# read as module-level globals by the functions further down in this file.
print("Initializing models...")
# spaCy pipeline; get_best_answer_span() iterates its noun chunks.
nlp = spacy.load("en_core_web_sm")
# Hugging Face text2text-generation pipeline; generate_question() sends it
# prompts in which the answer phrase is wrapped in <hl> markers.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
# KeyBERT built with its default arguments; get_best_answer_span() uses it
# to pick the single top-ranked keyword of a sentence.
kw_model = KeyBERT()

# =====================
# 2️⃣ Helper: Get the Best Answer Phrase
# =====================
def get_best_answer_span(sentence):
    """Pick the phrase of *sentence* that should act as the answer.

    Hybrid approach:

    1. KeyBERT ranks single words; the top-ranked one is the topic keyword.
    2. The spaCy noun chunks of the sentence are searched for the first
       chunk that contains this keyword, so "tree" becomes
       "the big oak tree".
    3. If no noun chunk contains the keyword, the keyword itself is used.

    The keyword is compared case-insensitively: KeyBERT's default
    vectorizer lowercases its candidates, so a case-sensitive comparison
    would never match capitalised words such as proper nouns.

    Args:
        sentence: The sentence to analyse.

    Returns:
        The chosen phrase with surrounding whitespace removed, or ``None``
        when the sentence is empty/blank or KeyBERT returns no keyword.
    """
    if not sentence or not sentence.strip():
        return None

    # Step 1: extract the single best 1-gram keyword (the "topic").
    keywords = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),  # single words only
        stop_words='english',
        top_n=1,
    )
    if not keywords:
        return None

    top_keyword = keywords[0][0]  # entries are (keyword, score) pairs
    needle = top_keyword.lower()

    # Step 2: widen the keyword to the noun chunk that contains it.
    # The sentence is parsed only now, once a keyword is known to exist.
    chunks = list(nlp(sentence).noun_chunks)

    # Prefer a chunk in which the keyword is a whole token, so that "art"
    # does not latch onto a chunk like "the party".
    best_phrase = next(
        (
            chunk.text
            for chunk in chunks
            if any(token.lower_ == needle for token in chunk)
        ),
        None,
    )

    # The two tokenizers may split words differently; fall back to a
    # case-insensitive substring search before giving up on the chunks.
    if not best_phrase:
        best_phrase = next(
            (chunk.text for chunk in chunks if needle in chunk.text.lower()),
            None,
        )

    # Step 3: no noun chunk contains the keyword -> use the keyword itself.
    if not best_phrase:
        best_phrase = top_keyword

    # Remove leading/trailing whitespace only; punctuation is left as is.
    return best_phrase.strip()

# =====================
# 3️⃣ Main Generation Function
# =====================
def generate_question(sentence):
    """Generate a question whose answer is the key phrase of *sentence*.

    The answer phrase comes from ``get_best_answer_span``. It is wrapped in
    ``<hl>`` markers inside the sentence and the resulting prompt is passed
    to ``qg_pipeline``. The sentence, answer and question are also printed.

    Args:
        sentence: The source sentence.

    Returns:
        A ``(sentence, answer, question)`` tuple, or ``(None, None, None)``
        when no answer phrase could be selected.
    """
    # Step 1: get the best answer phrase.
    answer = get_best_answer_span(sentence)

    if not answer:
        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
        return None, None, None

    # Step 2: highlight the answer for T5.
    # The search runs on the ORIGINAL string (case-insensitively) instead of
    # on a lowercased copy: str.lower() can change the length of a string,
    # which would shift the offsets used to place the markers.
    match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
    if match is None:
        # Fallback if the phrase cannot be located: append it, highlighted.
        highlighted = f"{sentence} <hl> {answer} <hl>"
    else:
        # match.group(0) keeps the sentence's original casing.
        highlighted = (
            f"{sentence[:match.start()]}"
            f"<hl> {match.group(0)} <hl>"
            f"{sentence[match.end():]}"
        )
    prompt = f"generate question: {highlighted}"

    # Step 3: generate the question.
    result = qg_pipeline(prompt, max_new_tokens=64)
    question = result[0]['generated_text']

    print(f"\n🧠 Sentence: {sentence}")
    print(f"🎯 Answer (Phrase): {answer}")
    print(f"❓ Generated Question: {question}")

    return sentence, answer, question

# =====================
# 4️⃣ Process Input File
# =====================
if __name__ == "__main__":
    # Script entry point: read one sentence per line from input_file,
    # generate a question for each, and write the results to output_file.
    input_file = "mockSentence.txt"
    output_file = "mockQuestions_v2.csv"

    # Create a small sample input file if none exists yet.
    import os
    if not os.path.exists(input_file):
        # Written as UTF-8 explicitly so it matches the encoding used to
        # read the file back below, whatever the platform default is.
        with open(input_file, "w", encoding="utf-8") as f:
            f.write("Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n")
            f.write("Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n")

    # One sentence per line; blank lines are ignored.
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]

    print(f"\nLoaded {len(sentences)} sentences.")

    results = []
    for sent in sentences:
        # Best-effort batch: a failure on one sentence is reported and the
        # remaining sentences are still processed.
        try:
            s, ans, q = generate_question(sent)
            if s:  # Only add if generation was successful
                results.append((s, ans, q))
        except Exception as e:
            print(f"⚠️ Error processing: {sent}\n{e}")

    # Save to CSV (newline='' lets the csv module control line endings).
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
        writer.writerows(results)

    print(f"\n✅ Done! Check {output_file}")