import os
# Fix: Redirect Hugging Face cache to a writable folder
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_cache" # New standard from transformers v5+
from transformers import pipeline
# Load the model once at the start — this avoids reloading on every request
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Define the classification labels
labels = ["question", "option", "answer", "other"]
def extract_mcqs_with_model(text):
    """
    Extract MCQs from a large body of text using zero-shot classification.
    Optimized for large documents via batch processing.
    """
    # Clean the text and split it into chunks separated by blank lines
    chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
    mcqs = []
    current = {"question": "", "options": [], "answer": ""}
    # Classify chunks in batches for speed (10 chunks per pipeline call)
    batch_size = 10
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        results = classifier(batch, labels)
        for chunk, result in zip(batch, results):
            # The pipeline returns labels sorted by score; take the top one
            label = result["labels"][0]
            if label == "question":
                # A new question starts a new MCQ; store the previous one first
                if current["question"]:
                    mcqs.append(current)
                    current = {"question": "", "options": [], "answer": ""}
                current["question"] = chunk
            elif label == "option":
                current["options"].append(chunk)
            elif label == "answer":
                current["answer"] = chunk
    # Don't drop the last MCQ collected
    if current["question"]:
        mcqs.append(current)
    return mcqs
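

# A minimal usage sketch (not part of the original file): the sample text below
# is hypothetical and assumes questions, options, and answers are separated by
# blank lines, which is the format the "\n\n" split above expects.
if __name__ == "__main__":
    sample = (
        "What is the capital of France?\n\n"
        "A) Berlin\n\n"
        "B) Paris\n\n"
        "C) Madrid\n\n"
        "Answer: B) Paris"
    )
    for mcq in extract_mcqs_with_model(sample):
        print("Q:", mcq["question"])
        print("Options:", mcq["options"])
        print("Answer:", mcq["answer"])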