Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import torch | |
| from transformers import AutoTokenizer, AutoModel | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder | |
| from imblearn.over_sampling import RandomOverSampler | |
def load_model_and_tokenizer():
    """Load the pretrained IndicBERT tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with ``from_pretrained``
        from the same ``ai4bharat/indic-bert`` checkpoint.
    """
    checkpoint = "ai4bharat/indic-bert"
    # Tokenizer first, then the encoder, both from the same checkpoint.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )
def get_embeddings(texts, tokenizer, model, batch_size=32):
    """Return the first-token ([CLS]) embedding for every text.

    The texts are processed in chunks of ``batch_size`` so that embedding a
    whole dataset does not require a single forward pass over all rows.

    Args:
        texts: A sequence of strings to embed. A single string is treated as
            one text.
        tokenizer: Callable invoked as ``tokenizer(batch, return_tensors="pt",
            padding=True, truncation=True)``; its result is unpacked as
            keyword arguments for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A tensor with one row per input text, built by concatenating
        ``last_hidden_state[:, 0, :]`` of every batch in input order.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is less than 1.
    """
    # A bare string must stay one text rather than being split into characters.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chunks = []
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            outputs = model(**inputs)
            chunks.append(outputs.last_hidden_state[:, 0, :])  # CLS token
    return torch.cat(chunks, dim=0)
def load_data():
    """Read the sample CSV and fill missing text and category values.

    Returns:
        The DataFrame read from ``SushasanSampleData.csv`` with missing
        ``applicationDetail`` values replaced by an empty string and missing
        ``applicationCategoryName`` values replaced by ``"अन्य"``.
    """
    placeholders = {
        'applicationDetail': "",
        'applicationCategoryName': "अन्य",
    }
    frame = pd.read_csv("SushasanSampleData.csv", encoding="utf-8")
    # Fill each column in turn; a missing column still raises KeyError.
    for column, placeholder in placeholders.items():
        frame[column] = frame[column].fillna(placeholder)
    return frame
def preprocess_and_train(df):
    """Embed the application texts and fit a logistic-regression classifier.

    Args:
        df: DataFrame with an ``applicationDetail`` text column and an
            ``applicationCategoryName`` label column.

    Returns:
        A ``(clf, tokenizer, model, label_encoder)`` tuple: the fitted
        classifier, the tokenizer and encoder used to build the features, and
        the encoder that maps predicted class ids back to category names.
    """
    tokenizer, model = load_model_and_tokenizer()
    features = get_embeddings(df['applicationDetail'].tolist(), tokenizer, model)
    features = features.cpu().numpy()

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['applicationCategoryName'])

    # Oversample before splitting so the class counts are balanced.
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(features, labels)

    # Only the training portion is used; the held-out 20% is not evaluated
    # here. The split is kept so the fitted model matches earlier behaviour.
    X_train, _, y_train, _ = train_test_split(
        X_resampled,
        y_resampled,
        test_size=0.2,
        random_state=42,
        stratify=y_resampled,
    )

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    return clf, tokenizer, model, label_encoder
@st.cache_resource
def _load_pipeline():
    """Load the data and train the classifier once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the result avoids reloading the encoder, re-embedding the dataset and
    refitting the classifier on each click.

    Returns:
        ``(df, clf, tokenizer, model, label_encoder)``.
    """
    data = load_data()
    return (data, *preprocess_and_train(data))


df, clf, tokenizer, model, label_encoder = _load_pipeline()

# Streamlit UI
st.title("🇮🇳 Hindi Category Classifier (IndicBERT Powered)")
user_input = st.text_area("✍️ Enter Application Detail", "")
if st.button("🔍 Predict"):
    if not user_input.strip():
        # Whitespace-only input carries nothing to classify.
        st.warning("Please write something.")
    else:
        user_emb = get_embeddings([user_input], tokenizer, model)
        user_emb = user_emb.cpu().numpy()
        prediction = clf.predict(user_emb)
        # Map the predicted class id back to its category name.
        label = label_encoder.inverse_transform(prediction)[0]
        st.success(f"🧠 Predicted Category: **{label}**")