Spaces:

sangyan5
/

Neural-Assessment-Generator

Build error

File size: 4,269 Bytes

7312afb

from transformers import pipeline
import spacy
import csv
from keybert import KeyBERT # <--- The new Research Component

# =====================
# 1️⃣ Load Models
# =====================
print("Initializing models...")

# spaCy English pipeline; its noun chunks are used to expand a keyword
# into a full phrase.
nlp = spacy.load("en_core_web_sm")

# Text-to-text question generator; the prompt built later in this file
# marks the answer span with <hl> tokens.
qg_pipeline = pipeline(
    task="text2text-generation",
    model="valhalla/t5-base-qg-hl",
)

# Keyword extractor constructed with its default settings.
kw_model = KeyBERT()

# =====================
# 2️⃣ Helper: Get the Best Answer Phrase
# =====================
def get_best_answer_span(sentence):
    """Pick the phrase in *sentence* to use as the answer for question generation.

    Hybrid approach:

    1. Use KeyBERT to find the single most important WORD.
    2. Use spaCy to find the full NOUN PHRASE containing that word, so that
       "tree" can become "the big oak tree".

    The keyword is compared case-insensitively, and a noun chunk holding the
    keyword as a whole token is preferred over one that merely contains it as
    a substring (so "art" does not select "the party" when a better chunk
    exists).

    Args:
        sentence: The sentence to analyse.

    Returns:
        The chosen phrase with surrounding whitespace removed, or ``None``
        when the sentence is empty/blank or KeyBERT returns no keyword.
    """
    # Nothing to extract from an empty or whitespace-only sentence.
    if not sentence or not sentence.strip():
        return None

    # Step 1: Extract keywords (1-gram only) to find the "topic".
    keywords = kw_model.extract_keywords(
        sentence,
        keyphrase_ngram_range=(1, 1),  # Only look for single words
        stop_words='english',
        top_n=1,
    )

    if not keywords:
        return None

    top_keyword = keywords[0][0]  # e.g., "tree"

    # KeyBERT keywords typically come back lowercased (its default vectorizer
    # lowercases candidates), while noun chunks keep the sentence's casing,
    # so both sides are lowercased before comparing.
    keyword_lower = top_keyword.lower()

    # Step 2: Find the noun chunk that contains this keyword.
    # Parsing is deferred until here so it is skipped when there is no keyword.
    doc = nlp(sentence)

    substring_match = None
    for chunk in doc.noun_chunks:
        # Best case: the keyword is one of the chunk's own tokens.
        if any(token.text.lower() == keyword_lower for token in chunk):
            return chunk.text.strip()
        # Remember the first looser (substring) hit in case no chunk
        # contains the keyword as a whole token.
        if substring_match is None and keyword_lower in chunk.text.lower():
            substring_match = chunk.text

    # Step 3: Fallback - if the keyword isn't in any noun chunk, use the
    # keyword itself.
    best_phrase = substring_match or top_keyword

    # Clean up (remove leading/trailing whitespace).
    return best_phrase.strip()

# =====================
# 3️⃣ Main Generation Function
# =====================
def generate_question(sentence):
    """Generate one question for *sentence* and print the result.

    The answer phrase is chosen by ``get_best_answer_span``, wrapped in
    ``<hl>`` markers inside the sentence, and the prompt is passed to
    ``qg_pipeline``.

    Args:
        sentence: The source sentence.

    Returns:
        A ``(sentence, answer, question)`` tuple, or ``(None, None, None)``
        when no answer phrase could be found.
    """
    # Local import keeps this function self-contained; `re` is only used here.
    import re

    # Step 1: Get the best answer phrase
    answer = get_best_answer_span(sentence)

    if not answer:
        print(f"⚠️ Skipping: No valid answer found in '{sentence[:20]}...'")
        return None, None, None

    # Step 2: Highlight the answer for T5.
    # Search the ORIGINAL sentence case-insensitively. Computing the offset on
    # sentence.lower() is unsafe because lowercasing can change the string's
    # length for some Unicode characters, which would shift the slice indices.
    match = re.search(re.escape(answer), sentence, flags=re.IGNORECASE)
    if match is None:
        # Fallback if text format differs slightly: append the answer instead
        # of marking it in place.
        highlighted = f"generate question: {sentence} <hl> {answer} <hl>"
    else:
        # Slice the original sentence so its casing is preserved.
        start, end = match.span()
        highlighted = (
            "generate question: "
            + sentence[:start]
            + f"<hl> {sentence[start:end]} <hl>"
            + sentence[end:]
        )

    # Step 3: Generate Question
    result = qg_pipeline(highlighted, max_new_tokens=64)
    question = result[0]['generated_text']

    print(f"\n🧠 Sentence: {sentence}")
    print(f"🎯 Answer (Phrase): {answer}")
    print(f"❓ Generated Question: {question}")

    return sentence, answer, question

# =====================
# 4️⃣ Process Input File
# =====================
if __name__ == "__main__":
    # Script entry point: read sentences (one per line), generate a question
    # for each, and write the results to a CSV file.
    input_file = "mockSentence.txt"
    output_file = "mockQuestions_v2.csv"

    # Create dummy file if not exists
    import os
    if not os.path.exists(input_file):
        # Write as UTF-8 so the file matches the encoding used to read it
        # back below, regardless of the platform default.
        with open(input_file, "w", encoding="utf-8") as f:
            f.write("Machine learning is a field of inquiry devoted to understanding and building methods that learn.\n")
            f.write("Elon Musk founded SpaceX in 2002 with the goal of reducing space transportation costs.\n")

    # Blank lines are dropped; each remaining line is treated as one sentence.
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]

    print(f"\nLoaded {len(sentences)} sentences.")

    results = []
    for sent in sentences:
        # Best-effort batch: a failure on one sentence is reported and the
        # remaining sentences are still processed.
        try:
            s, ans, q = generate_question(sent)
            if s:  # Only add if generation was successful
                results.append((s, ans, q))
        except Exception as e:
            print(f"⚠️ Error processing: {sent}\n{e}")

    # Save to CSV (newline='' lets the csv module control line endings).
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence", "Answer_Phrase", "Generated_Question"])
        writer.writerows(results)

    print(f"\n✅ Done! Check {output_file}")