# File size: 4,203 Bytes (revision 4d3516d)
import joblib
import pandas as pd
from PIL import Image
import pytesseract
from scipy.sparse import hstack, csr_matrix
# Load the serialized bundle once, at import time. The path is absolute and
# machine-specific, so importing this module fails if the file is missing.
print("🔄 Loading model and preprocessors...")
bundle = joblib.load("/Users/vidyasagarkaruturi/Downloads/machine learning/src/document_classifier_xgb.pkl")

# Pull out the four entries the rest of this file uses; the keys are looked
# up in the order listed.
model, word_vectorizer, char_vectorizer, scaler = (
    bundle[component]
    for component in ("model", "word_vectorizer", "char_vectorizer", "scaler")
)
print("✅ Loaded successfully")
# ===============================
# LABEL MAPPING (Important!)
# ===============================
# Maps the integer class index to a document type name. Each name's position
# in the tuple below is the integer key it is stored under (0..15).
# NOTE(review): presumably these indices match the label encoding used when
# the model was trained — confirm against the training code.
label_map = dict(enumerate((
    "advertisement",
    "budget",
    "email",
    "file folder",
    "form",
    "handwritten",
    "invoice",
    "letter",
    "memo",
    "news article",
    "presentation",
    "questionnaire",
    "resume",
    "scientific publication",
    "scientific report",
    "specification",
)))
# ===============================
# FEATURE EXTRACTION
# ===============================
def extract_features(text):
    """Compute simple count statistics over *text*.

    Returns a dict with these keys, in this order:

    - ``char_count``: ``len(text)``
    - ``digit_count``: number of characters for which ``isdigit()`` is true
    - ``uppercase_count``: number of characters for which ``isupper()`` is true
    - ``currency_count``: occurrences of ``"$"`` plus occurrences of ``"€"``
    - ``line_count``: occurrences of ``"\\n"`` (newline characters, so text
      without a trailing newline reports one less than its number of lines)
    """
    total_chars = len(text)

    # Tally digits and uppercase letters in a single pass over the text.
    digits = 0
    uppers = 0
    for ch in text:
        if ch.isdigit():
            digits += 1
        if ch.isupper():
            uppers += 1

    dollars = text.count("$")
    euros = text.count("€")
    newlines = text.count("\n")

    return {
        "char_count": total_chars,
        "digit_count": digits,
        "uppercase_count": uppers,
        "currency_count": dollars + euros,
        "line_count": newlines,
    }
# ===============================
# PREDICTION FUNCTION
# ===============================
def _debug_dump(name, value):
    """Print *value* framed by the "<name> starts" / "<name> ends" marker lines."""
    rule = "-------------------------------------------"
    print(f"{name} starts {rule}")
    print(value)
    print(f"{name} ends {rule}")


def predict_document(image_path, *, verbose=True):
    """OCR an image, build the feature row, and classify the document.

    The extracted text is transformed by the module-level ``word_vectorizer``
    and ``char_vectorizer``, the counts from ``extract_features`` are passed
    through ``scaler``, and the three parts are stacked column-wise (word,
    char, numeric) before being handed to ``model``.
    NOTE(review): presumably this column order mirrors the one used at
    training time — confirm against the training script.

    Args:
        image_path: Path of the image file to open with PIL.
        verbose: When true (the default), print every intermediate value
            between "starts"/"ends" marker lines. The OCR banner and the
            final prediction are printed regardless.

    Returns:
        A ``(label, confidence)`` tuple: the ``label_map`` name for the
        predicted class and the highest predicted probability as a float
        in the range the model reports (printed as a percentage).

    Raises:
        KeyError: If the predicted class is not a key of ``label_map``.
        Any exception raised by ``Image.open``, ``pytesseract`` or the
        model/preprocessors propagates unchanged.
    """
    dump = _debug_dump if verbose else (lambda name, value: None)

    print(f"\n📄 Running OCR on: {image_path}")
    # Use a context manager so the underlying file handle is released as
    # soon as OCR is done instead of whenever the image is garbage-collected.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    dump("text", text)

    # TEXT FEATURES
    word_features = word_vectorizer.transform([text])
    dump("word_features", word_features)
    char_features = char_vectorizer.transform([text])
    dump("char_features", char_features)
    text_features = hstack([word_features, char_features])
    dump("text_features", text_features)

    # NUMERIC FEATURES
    numeric_dict = extract_features(text)
    dump("numeric_dict", numeric_dict)
    # Single-row frame so the scaler receives named columns.
    numeric_df = pd.DataFrame([numeric_dict])
    dump("numeric_df", numeric_df)
    numeric_scaled = scaler.transform(numeric_df)
    dump("numeric_scaled", numeric_scaled)
    # Convert to sparse so it can be stacked with the text features.
    numeric_sparse = csr_matrix(numeric_scaled)
    dump("numeric_sparse", numeric_sparse)

    # COMBINE
    final_features = hstack([text_features, numeric_sparse])
    dump("final_features", final_features)

    # PREDICT
    prediction = model.predict(final_features)[0]
    # Only one row is scored, so the overall max is the top class probability.
    probability = model.predict_proba(final_features).max()
    label = label_map[prediction]

    print("\n🎯 Prediction:")
    print("Document Type:", label)
    print("Confidence:", round(probability * 100, 2), "%")

    # Hand the result back so callers can use it programmatically.
    return label, float(probability)
# ===============================
# RUN
# ===============================
if __name__ == "__main__":
    # Local import: only the script entry point needs command-line access.
    import sys

    # Alternative sample image:
    # image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-31 at 9.25.49 PM.png"
    default_image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-13 at 9.20.04 AM.png"

    # Use the first command-line argument when given; otherwise fall back to
    # the original hard-coded sample so running with no arguments is unchanged.
    image_path = sys.argv[1] if len(sys.argv) > 1 else default_image_path
    predict_document(image_path)