import os

# Fix: redirect the Hugging Face cache to a writable folder
os.environ["HF_HOME"] = "/tmp/hf_cache"  # current standard cache variable
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"  # deprecated in favor of HF_HOME; slated for removal in Transformers v5

from transformers import pipeline

# Load the model once at startup so it is not reloaded on every request
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the classification labels
labels = ["question", "option", "answer", "other"]

def extract_mcqs_with_model(text):
    """
    Extract MCQs from a given large body of text using zero-shot classification.
    Optimized for large documents by batch processing.
    """
    # Clean and split text into meaningful chunks
    chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
    mcqs = []
    current = {"question": "", "options": [], "answer": ""}

    # Classify chunks in batches of 10 to cut per-call overhead
    batch_size = 10
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        results = classifier(batch, labels)
        # Some versions of the pipeline return a single dict for a
        # one-element input; normalize to a list so zip() below is safe
        if isinstance(results, dict):
            results = [results]

        for chunk, result in zip(batch, results):
            label = result['labels'][0]  # labels come sorted by score, highest first
            if label == "question":
                if current["question"]:
                    mcqs.append(current)
                    current = {"question": "", "options": [], "answer": ""}
                current["question"] = chunk
            elif label == "option":
                current["options"].append(chunk)
            elif label == "answer":
                current["answer"] = chunk

    if current["question"]:
        mcqs.append(current)

    return mcqs
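

# A minimal usage sketch. The sample text below is hypothetical: any document
# whose questions, options, and answers appear as separate blank-line-separated
# blocks should flow through the extractor the same way.
if __name__ == "__main__":
    sample = "\n\n".join([
        "1. What is the capital of France?",
        "A) Berlin",
        "B) Paris",
        "C) Madrid",
        "Answer: B) Paris",
    ])
    for i, mcq in enumerate(extract_mcqs_with_model(sample), start=1):
        print(f"Q{i}: {mcq['question']}")
        for option in mcq["options"]:
            print(f"  {option}")
        print(f"  Answer: {mcq['answer']}")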