File size: 4,203 Bytes
4d3516d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import joblib
import pandas as pd
from PIL import Image
import pytesseract
from scipy.sparse import hstack, csr_matrix

# Announce the load up front; reading the bundle from disk happens at import time.
print("🔄 Loading model and preprocessors...")

# NOTE: joblib.load unpickles the file, so only load bundles from a trusted source.
_BUNDLE_PATH = "/Users/vidyasagarkaruturi/Downloads/machine learning/src/document_classifier_xgb.pkl"
bundle = joblib.load(_BUNDLE_PATH)

# Pull the classifier and the preprocessing objects out of the bundle, in the
# same order they are used later (word features, char features, numeric scaling).
model, word_vectorizer, char_vectorizer, scaler = (
    bundle[key]
    for key in ("model", "word_vectorizer", "char_vectorizer", "scaler")
)

print("✅ Loaded successfully")

# ===============================
# LABEL MAPPING (Important!)
# ===============================
# Integer class index -> human-readable document type. A label's position in
# the tuple below is its key in the resulting dict (0 through 15).
label_map = dict(enumerate((
    "advertisement",
    "budget",
    "email",
    "file folder",
    "form",
    "handwritten",
    "invoice",
    "letter",
    "memo",
    "news article",
    "presentation",
    "questionnaire",
    "resume",
    "scientific publication",
    "scientific report",
    "specification",
)))

# ===============================
# FEATURE EXTRACTION
# ===============================

def extract_features(text):
    """Compute simple character-level statistics for a piece of text.

    Args:
        text: The string to analyse.

    Returns:
        A dict with, in this order, the keys ``char_count`` (length of the
        string), ``digit_count``, ``uppercase_count``, ``currency_count``
        (occurrences of ``$`` plus ``€``) and ``line_count`` (number of
        newline characters, so a single line without a trailing newline
        counts as 0).
    """
    stats = {}
    stats["char_count"] = len(text)
    stats["digit_count"] = sum(map(str.isdigit, text))
    stats["uppercase_count"] = sum(map(str.isupper, text))
    stats["currency_count"] = sum(text.count(symbol) for symbol in ("$", "€"))
    stats["line_count"] = text.count("\n")
    return stats

# ===============================
# PREDICTION FUNCTION
# ===============================

def _debug_dump(name, value, enabled):
    """Print ``value`` between ``<name> starts`` / ``<name> ends`` marker lines when enabled."""
    if not enabled:
        return
    print(f"{name} starts -------------------------------------------")
    print(value)
    print(f"{name} ends -------------------------------------------")


def predict_document(image_path, *, verbose=True):
    """Run OCR on an image, classify the extracted text and print the result.

    The OCR text is turned into word-level and char-level vectorizer features
    plus the scaled numeric statistics from ``extract_features``; the three
    parts are stacked column-wise (word, char, numeric) and passed to the
    module-level ``model``.

    Args:
        image_path: Path of the image file to open and OCR.
        verbose: Keyword-only. When true (the default) every intermediate
            value is dumped to stdout between "starts"/"ends" marker lines.
            Pass ``False`` to print only the OCR banner and the prediction.

    Returns:
        A ``(label, confidence)`` tuple: the document type looked up in
        ``label_map`` and the highest class probability as a float in [0, 1].

    Raises:
        KeyError: If the predicted class is not a key of ``label_map``.
    """
    print(f"\n📄 Running OCR on: {image_path}")

    # Use a context manager so the image file handle is released right after
    # OCR instead of staying open until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    _debug_dump("text", text, verbose)

    # TEXT FEATURES
    word_features = word_vectorizer.transform([text])
    _debug_dump("word_features", word_features, verbose)
    char_features = char_vectorizer.transform([text])
    _debug_dump("char_features", char_features, verbose)
    text_features = hstack([word_features, char_features])
    _debug_dump("text_features", text_features, verbose)

    # NUMERIC FEATURES
    numeric_dict = extract_features(text)
    _debug_dump("numeric_dict", numeric_dict, verbose)
    # One-row frame; its column order follows the key order of numeric_dict.
    numeric_df = pd.DataFrame([numeric_dict])
    _debug_dump("numeric_df", numeric_df, verbose)
    numeric_scaled = scaler.transform(numeric_df)
    _debug_dump("numeric_scaled", numeric_scaled, verbose)
    # Convert to sparse so it can be stacked with the sparse text features.
    numeric_sparse = csr_matrix(numeric_scaled)
    _debug_dump("numeric_sparse", numeric_sparse, verbose)

    # COMBINE
    final_features = hstack([text_features, numeric_sparse])
    _debug_dump("final_features", final_features, verbose)

    # PREDICT
    prediction = model.predict(final_features)[0]
    probability = model.predict_proba(final_features).max()
    label = label_map[prediction]

    print("\n🎯 Prediction:")
    print("Document Type:", label)
    print("Confidence:", round(probability * 100, 2), "%")

    # Hand the result back so callers can use it programmatically; callers
    # that ignore the return value behave exactly as before.
    return label, float(probability)


# ===============================
# RUN
# ===============================

if __name__ == "__main__":
    import sys

    # Sample screenshot classified when no path is given on the command line.
    default_image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-13 at 9.20.04 AM.png"

    # Optional first CLI argument overrides the sample image, so a different
    # file can be classified without editing this script.
    image_path = sys.argv[1] if len(sys.argv) > 1 else default_image_path
    predict_document(image_path)