import os

# Fix: redirect the Hugging Face cache to a writable folder.
# TRANSFORMERS_CACHE is deprecated (slated for removal in Transformers v5);
# HF_HOME is its replacement, so set both for compatibility.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_cache"

from transformers import pipeline

# Load the model once at import time so it is not reloaded on every request.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels for zero-shot classification of each text chunk.
labels = ["question", "option", "answer", "other"]


def extract_mcqs_with_model(text):
    """
    Extract MCQs from a large body of text using zero-shot classification.
    Optimized for large documents by processing chunks in batches.
    """
    # Split the text into paragraph-level chunks and drop empty ones.
    chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
    mcqs = []
    current = {"question": "", "options": [], "answer": ""}

    # Classify chunks in batches for speed (10 chunks per pipeline call).
    batch_size = 10
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        results = classifier(batch, labels)
        for chunk, result in zip(batch, results):
            # The pipeline returns labels sorted by score; take the top one.
            label = result["labels"][0]
            if label == "question":
                # A new question starts a new MCQ; flush the previous one.
                if current["question"]:
                    mcqs.append(current)
                    current = {"question": "", "options": [], "answer": ""}
                current["question"] = chunk
            elif label == "option":
                current["options"].append(chunk)
            elif label == "answer":
                current["answer"] = chunk

    # Flush the last in-progress MCQ, if any.
    if current["question"]:
        mcqs.append(current)
    return mcqs
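
# A minimal usage sketch (illustrative, not from the original): the sample
# text below is hypothetical and assumes each question, option, and answer
# is separated by a blank line, which is what the splitting above relies on.
if __name__ == "__main__":
    sample = (
        "What is the capital of France?\n\n"
        "A) Berlin\n\n"
        "B) Paris\n\n"
        "C) Madrid\n\n"
        "Answer: B"
    )
    for mcq in extract_mcqs_with_model(sample):
        print(mcq["question"], mcq["options"], mcq["answer"])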