# Source: document-classifier-xgb / predict_document.py
# Author: vidyasagar786
# Commit 4d3516d (verified): "Upload predict_document.py with huggingface_hub"
import joblib
import pandas as pd
from PIL import Image
import pytesseract
from scipy.sparse import hstack, csr_matrix
# Announce the load before it starts; deserializing happens at import time.
print("🔄 Loading model and preprocessors...")

# The pickle holds a mapping with the classifier and its preprocessing
# objects stored under the four keys unpacked below.
bundle = joblib.load(
    "/Users/vidyasagarkaruturi/Downloads/machine learning/src/document_classifier_xgb.pkl"
)
model, word_vectorizer, char_vectorizer, scaler = (
    bundle[key]
    for key in ("model", "word_vectorizer", "char_vectorizer", "scaler")
)

print("✅ Loaded successfully")
# ===============================
# LABEL MAPPING (Important!)
# ===============================
# Document type names listed by class index: position 0 is class 0, and so on.
_DOCUMENT_TYPES = (
    "advertisement",
    "budget",
    "email",
    "file folder",
    "form",
    "handwritten",
    "invoice",
    "letter",
    "memo",
    "news article",
    "presentation",
    "questionnaire",
    "resume",
    "scientific publication",
    "scientific report",
    "specification",
)

# Integer class index -> human-readable document type.
label_map = dict(enumerate(_DOCUMENT_TYPES))
# ===============================
# FEATURE EXTRACTION
# ===============================
def extract_features(text: str) -> dict:
    """Compute simple count-based statistics for a piece of text.

    Args:
        text: The text to summarize (in this file, the OCR output).

    Returns:
        A dict with these keys, in this order:

        * ``char_count``: total number of characters.
        * ``digit_count``: characters for which ``str.isdigit`` is true.
        * ``uppercase_count``: characters for which ``str.isupper`` is true.
        * ``currency_count``: occurrences of ``$`` plus occurrences of ``€``.
        * ``line_count``: number of newline characters (so text without a
          trailing newline reports one less than its number of lines).
    """
    # Key order is deliberate: the caller turns this dict into a one-row
    # DataFrame and hands it to the scaler, so the column order presumably
    # has to match what the scaler was fitted on -- do not reorder.
    return {
        "char_count": len(text),
        "digit_count": sum(c.isdigit() for c in text),
        "uppercase_count": sum(c.isupper() for c in text),
        "currency_count": text.count("$") + text.count("€"),
        "line_count": text.count("\n"),
    }
# ===============================
# PREDICTION FUNCTION
# ===============================
def _debug_dump(name, value):
    """Print *value* framed by the ``<name> starts`` / ``<name> ends`` markers."""
    print(f"{name} starts -------------------------------------------")
    print(value)
    print(f"{name} ends -------------------------------------------")


def predict_document(image_path, *, verbose=True):
    """Run OCR on an image, classify the extracted text and report the result.

    The OCR text is turned into word-level and char-level vectorizer
    features plus the scaled numeric statistics from ``extract_features``;
    the three parts are stacked column-wise and passed to the loaded model.

    Args:
        image_path: Path of the image to classify (anything ``Image.open``
            accepts).
        verbose: When true (the default, matching the original behavior),
            every intermediate value is printed between start/end markers.
            Pass ``False`` to print only the OCR notice and the prediction.

    Returns:
        A ``(label, probability)`` tuple: the document type name looked up
        in ``label_map`` and the highest class probability as a float.
    """
    dump = _debug_dump if verbose else (lambda name, value: None)

    print(f"\n📄 Running OCR on: {image_path}")
    # The context manager closes the image's file handle as soon as OCR is
    # done instead of leaving it open until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    dump("text", text)

    # TEXT FEATURES
    word_features = word_vectorizer.transform([text])
    dump("word_features", word_features)
    char_features = char_vectorizer.transform([text])
    dump("char_features", char_features)
    text_features = hstack([word_features, char_features])
    dump("text_features", text_features)

    # NUMERIC FEATURES
    numeric_dict = extract_features(text)
    dump("numeric_dict", numeric_dict)
    numeric_df = pd.DataFrame([numeric_dict])
    dump("numeric_df", numeric_df)
    numeric_scaled = scaler.transform(numeric_df)
    dump("numeric_scaled", numeric_scaled)
    # Converted to sparse so it can be stacked next to the text features.
    numeric_sparse = csr_matrix(numeric_scaled)
    dump("numeric_sparse", numeric_sparse)

    # COMBINE (text features first, numeric features last)
    final_features = hstack([text_features, numeric_sparse])
    dump("final_features", final_features)

    # PREDICT
    # int() turns the model's (typically NumPy) scalar into a plain int
    # before it is used as a ``label_map`` key.
    prediction = int(model.predict(final_features)[0])
    probability = model.predict_proba(final_features).max()
    label = label_map[prediction]

    print("\n🎯 Prediction:")
    print("Document Type:", label)
    print("Confidence:", round(probability * 100, 2), "%")

    return label, float(probability)
# ===============================
# RUN
# ===============================
if __name__ == "__main__":
    import sys

    # Usage: python predict_document.py [IMAGE_PATH]
    # With no argument the script falls back to the sample screenshot below.
    # image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-31 at 9.25.49 PM.png"
    image_path = "/Users/vidyasagarkaruturi/Desktop/Screenshot 2025-10-13 at 9.20.04 AM.png"
    if len(sys.argv) > 1:
        image_path = sys.argv[1]
    predict_document(image_path)