"""Detect bias categories in Nepali sentences extracted from a searchable PDF.

The script reads the text layer of a PDF with PyMuPDF, splits it into
sentences, and classifies each sentence with a fine-tuned DistilBERT model
loaded through the Hugging Face ``pipeline`` API.
"""

import os
import re
from typing import List

import fitz  # pymupdf
import torch
from transformers import pipeline

PDF_FILE_PATH = "module_b/file_2.pdf"

# Primary splitter: break on whitespace that follows a sentence terminator
# (danda, '.', '!' or '?') and precedes a character from the Devanagari ranges
# in the lookahead, or on the empty position right after a terminator at the
# very end of the text.
_SENTENCE_SPLIT_RE = re.compile(
    r'(?<=[।.!?])\s+(?=[अ-हँ-ॿअ-ह])|(?<=[।.!?])(?=$)'
)
# Fallback splitter: break on any whitespace that follows a terminator.
_FALLBACK_SPLIT_RE = re.compile(r'(?<=[।.!?])\s+')
# Characters removed from both ends of every extracted sentence.
_SENTENCE_EDGE_CHARS = ' ।.!?'
# Fragments whose stripped length is at most this many characters are dropped.
_MIN_FRAGMENT_LENGTH = 5


def _split_into_sentences(text: str) -> List[str]:
    """Split whitespace-normalised text into cleaned sentences.

    Args:
        text: Text in which every whitespace run is already a single space.

    Returns:
        The sentences with surrounding terminators and spaces removed.
        Fragments of five characters or fewer, and fragments that are empty
        once the punctuation is stripped, are discarded.
    """
    fragments = _SENTENCE_SPLIT_RE.split(text)
    if len(fragments) <= 1:
        # The Devanagari-aware pattern found nothing to split on; retry with
        # the plain "terminator followed by whitespace" rule.
        fragments = _FALLBACK_SPLIT_RE.split(text)

    cleaned = []
    for fragment in fragments:
        if len(fragment.strip()) <= _MIN_FRAGMENT_LENGTH:
            continue
        sentence = fragment.strip(_SENTENCE_EDGE_CHARS).strip()
        # A fragment made only of punctuation (e.g. "......") passes the
        # length check above but is empty here; never hand it to the model.
        if sentence:
            cleaned.append(sentence)
    return cleaned


def extract_nepali_sentences_from_pdf(pdf_path: str) -> List[str]:
    """Extract clean Nepali sentences from a searchable PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned sentences in document order. An empty list is returned
        (after printing a warning) when the PDF yields no text at all.
    """
    print(f"Opening PDF: {pdf_path}")

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    full_text = "\n".join(page_texts)

    if not full_text.strip():
        print("Warning: No text found. PDF might be scanned (image-based). Use OCR version instead.")
        return []

    # Collapse every whitespace run (newlines included) into one space.
    text = re.sub(r'\s+', ' ', full_text).strip()

    cleaned = _split_into_sentences(text)

    print(f"Successfully extracted {len(cleaned)} clean sentences.\n")
    return cleaned


# NOTE: the model is loaded at import time so that `classifier` remains a
# module-level name, exactly as before.
print("Loading your model from Hugging Face...")
model_name = "sangy1212/distilbert-base-nepali-fine-tuned"

classifier = pipeline(
    "text-classification",
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
    batch_size=16,
)

print("Model loaded and ready!\n")

# Maps the pipeline's label ids to category names.
# NOTE(review): "religional", "appearence", "amiguity" and "Disablity" look
# like misspellings (LABEL_2 possibly meant "regional" - confirm against the
# model's training labels). They are left untouched because the printed
# output depends on these exact strings.
id_to_label = {
    "LABEL_0": "neutral",
    "LABEL_1": "gender",
    "LABEL_2": "religional",
    "LABEL_3": "caste",
    "LABEL_4": "religion",
    "LABEL_5": "appearence",
    "LABEL_6": "socialstatus",
    "LABEL_7": "amiguity",
    "LABEL_8": "political",
    "LABEL_9": "Age",
    "LABEL_10": "Disablity",
}


def predict_bias_on_sentences(sentences: List[str], confidence_threshold: float = 0.7) -> None:
    """Run batch prediction and print a formatted report.

    Args:
        sentences: Sentences to classify.
        confidence_threshold: Minimum score at which a non-neutral prediction
            is counted as detected bias.

    Returns:
        None. The per-sentence results and the summary are printed.
    """
    if not sentences:
        print("No sentences to analyze.")
        return

    print(f"Running bias detection on {len(sentences)} sentences...\n")

    # Batch inference. `truncation=True` is forwarded to the tokenizer so an
    # over-long "sentence" (common in PDF-extracted text) is cut to the
    # tokenizer's maximum length instead of making inference fail.
    results = classifier(sentences, truncation=True)

    print("=" * 100)
    print("BIAS DETECTION RESULTS")
    print("=" * 100)

    biased_count = 0
    for sent, res in zip(sentences, results):
        # Label ids missing from the mapping are reported as "unknown".
        category = id_to_label.get(res['label'], "unknown")
        confidence = res['score']

        if category != "neutral" and confidence >= confidence_threshold:
            mark = " BIAS DETECTED"
            biased_count += 1
        else:
            mark = "✓ neutral / low confidence"

        print(mark)
        print(f" Category : {category.upper()}")
        print(f" Confidence : {confidence:.3f}")
        print(f" Sentence : {sent}")
        print("-" * 80)

    print(f"\nSummary: {biased_count}/{len(sentences)} sentences contain detectable bias (confidence ≥ {confidence_threshold})")


def main() -> None:
    """Extract sentences from ``PDF_FILE_PATH`` and report detected bias."""
    pdf_file_path = PDF_FILE_PATH

    if not os.path.exists(pdf_file_path):
        print(f"PDF file not found at: {pdf_file_path}. Please check the path.")
        # SystemExit works even where the `site`-provided exit() is missing.
        raise SystemExit(1)
    print(f"Using PDF file at: {pdf_file_path}\n")

    # Step 1: Extract sentences
    sentences = extract_nepali_sentences_from_pdf(pdf_file_path)

    # Step 2: Run batch prediction
    if sentences:
        predict_bias_on_sentences(sentences, confidence_threshold=0.7)

    print("\nDone! Your bias detection is complete.")


if __name__ == "__main__":
    main()