"""Classify a document image: OCR it, build features, run the saved model.

Importing this module loads the model bundle from disk (see ``bundle`` below)
and prints two progress messages; running it as a script classifies one
hard-coded sample image.
"""
import joblib
import pandas as pd
from PIL import Image
import pytesseract
from scipy.sparse import hstack, csr_matrix

print("🔄 Loading model and preprocessors...")

# NOTE(review): joblib.load unpickles arbitrary objects; only point this at a
# bundle file you produced yourself or otherwise trust.
bundle = joblib.load("/Users/vidyasagarkaruturi/Downloads/machine learning/src/document_classifier_xgb.pkl")

model = bundle["model"]
word_vectorizer = bundle["word_vectorizer"]
char_vectorizer = bundle["char_vectorizer"]
scaler = bundle["scaler"]

print("✅ Loaded successfully")

# ===============================
# LABEL MAPPING (Important!)
# ===============================
# Maps the model's integer class id to a human-readable document type.
# NOTE(review): presumably this must match the label encoding used when the
# model was trained -- verify against the training script.
label_map = {
    0: "advertisement",
    1: "budget",
    2: "email",
    3: "file folder",
    4: "form",
    5: "handwritten",
    6: "invoice",
    7: "letter",
    8: "memo",
    9: "news article",
    10: "presentation",
    11: "questionnaire",
    12: "resume",
    13: "scientific publication",
    14: "scientific report",
    15: "specification",
}

# Separator appended to the "starts"/"ends" banner of every debug dump.
_RULE = "-------------------------------------------"


def _dump(name, value):
    """Print *value* framed by '<name> starts' / '<name> ends' banner lines."""
    print(f"{name} starts {_RULE}")
    print(value)
    print(f"{name} ends {_RULE}")


# ===============================
# FEATURE EXTRACTION
# ===============================
def extract_features(text: str) -> dict:
    """Return simple count-based numeric features for *text*.

    Keys: ``char_count`` (length of the string), ``digit_count``,
    ``uppercase_count``, ``currency_count`` (occurrences of ``$`` plus ``€``)
    and ``line_count`` (number of newline characters, so a text without a
    trailing newline reports one less than its visual line count).
    """
    return {
        "char_count": len(text),
        "digit_count": sum(c.isdigit() for c in text),
        "uppercase_count": sum(c.isupper() for c in text),
        "currency_count": text.count("$") + text.count("€"),
        "line_count": text.count("\n"),
    }


# ===============================
# PREDICTION FUNCTION
# ===============================
def predict_document(image_path, *, verbose: bool = True) -> tuple:
    """OCR the image at *image_path*, classify it and print the result.

    Args:
        image_path: Path of the image file to open and run OCR on.
        verbose: When true (the default), print every intermediate value
            (OCR text, feature matrices, scaled numerics) between
            "starts"/"ends" banners. The OCR notice and the final
            prediction are printed regardless.

    Returns:
        ``(label, confidence)`` where *label* is the ``label_map`` entry for
        the predicted class and *confidence* is the largest value returned
        by ``model.predict_proba`` as a plain float.

    Raises:
        KeyError: If the predicted class id is not a key of ``label_map``.
    """
    print(f"\n📄 Running OCR on: {image_path}")

    # The context manager closes the underlying file as soon as OCR is done
    # instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    if verbose:
        _dump("text", text)

    # TEXT FEATURES
    word_features = word_vectorizer.transform([text])
    if verbose:
        _dump("word_features", word_features)

    char_features = char_vectorizer.transform([text])
    if verbose:
        _dump("char_features", char_features)

    text_features = hstack([word_features, char_features])
    if verbose:
        _dump("text_features", text_features)

    # NUMERIC FEATURES
    numeric_dict = extract_features(text)
    if verbose:
        _dump("numeric_dict", numeric_dict)

    # Single-row frame; column order follows the key order of numeric_dict.
    # NOTE(review): presumably this must equal the column order the scaler
    # was fitted with -- confirm against the training code.
    numeric_df = pd.DataFrame([numeric_dict])
    if verbose:
        _dump("numeric_df", numeric_df)

    numeric_scaled = scaler.transform(numeric_df)
    if verbose:
        _dump("numeric_scaled", numeric_scaled)

    numeric_sparse = csr_matrix(numeric_scaled)
    if verbose:
        _dump("numeric_sparse", numeric_sparse)

    # COMBINE: word n-grams | char n-grams | scaled numeric columns
    final_features = hstack([text_features, numeric_sparse])
    if verbose:
        _dump("final_features", final_features)

    # PREDICT
    prediction = model.predict(final_features)[0]
    probability = model.predict_proba(final_features).max()
    label = label_map[prediction]

    print("\n🎯 Prediction:")
    print("Document Type:", label)
    print("Confidence:", round(probability * 100, 2), "%")

    return label, float(probability)


# ===============================
# RUN
# ===============================
if __name__ == "__main__":
    # Alternative sample:
    # image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-31 at 9.25.49 PM.png"
    image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-13 at 9.20.04 AM.png"
    predict_document(image_path)