Spaces:
Running
Running
| """ | |
| Training Script: Complaint Text Classifier | |
| Trains a TF-IDF + SGDClassifier for categorizing citizen complaints. | |
| Usage: | |
| python ml/train_classifier.py | |
| Output: | |
| ml/weights/classifier.pkl — contains trained model + vectorizer | |
| """ | |
| import os | |
| import sys | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import SGDClassifier | |
| from sklearn.model_selection import cross_val_score, train_test_split | |
| from sklearn.metrics import classification_report, confusion_matrix | |
| import joblib | |
| # Add parent directory to path | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| def load_training_data(): | |
| """Load training data from CSV or generate synthetic data.""" | |
| csv_path = os.path.join(os.path.dirname(__file__), "data", "complaints_train.csv") | |
| if os.path.exists(csv_path): | |
| print(f"Loading training data from {csv_path}...") | |
| df = pd.read_csv(csv_path) | |
| return df["text"].tolist(), df["category"].tolist() | |
| print("No CSV found — generating synthetic training data...") | |
| return generate_synthetic_data() | |
| def generate_synthetic_data(): | |
| """Generate synthetic complaint training data.""" | |
| data = { | |
| "Water Supply": [ | |
| "Water pipe burst on main road near school", | |
| "No water supply for 3 days in our area", | |
| "Drinking water contaminated with brown color", | |
| "Water tanker not arriving since last week", | |
| "Borewell not working in sector 5", | |
| "Leaking pipe flooding the street", | |
| "Low water pressure in apartments", | |
| "Water supply timing changed without notice", | |
| "Sewage water mixing with drinking water", | |
| "Water meter broken need replacement", | |
| "Tap water smells bad since yesterday", | |
| "Underground pipeline leaking near park", | |
| "Water tank overflow causing flooding", | |
| "No hot water in community center", | |
| "Water connection disconnected wrongly", | |
| "Paani ka pipe toot gaya hai gali mein", | |
| "Teen din se paani nahi aa raha", | |
| "Nala overflow ho raha hai ward 7 mein", | |
| "Pipeline burst near children playground", | |
| "Water supply irregular in low income area", | |
| ], | |
| "Roads & Potholes": [ | |
| "Big pothole on highway near toll plaza", | |
| "Road completely damaged after monsoon", | |
| "Speed breaker too high causing accidents", | |
| "Footpath broken and dangerous for walking", | |
| "Road cave-in near bus stop", | |
| "Tar road melting in summer heat", | |
| "No road dividers on busy intersection", | |
| "Construction debris left on road", | |
| "Road not repaired after digging for cable", | |
| "Potholes causing flat tires daily", | |
| "Damaged road near hospital ambulance route", | |
| "Broken speed bumps need repair", | |
| "Road surface cracked after heavy rain", | |
| "Uneven road causing accidents at night", | |
| "Sinkhole forming on main road", | |
| "Sadak toot gayi hai ward 12 mein", | |
| "Gaddhe bahut hain school ke paas", | |
| "Highway pe accident ho raha hai daily", | |
| "Road not asphalted since 2 years", | |
| "Muddy road impassable in monsoon", | |
| ], | |
| "Drainage": [ | |
| "Drainage blocked causing waterlogging", | |
| "Sewer line overflow in residential area", | |
| "Manhole cover missing on main road", | |
| "Stagnant water breeding mosquitoes", | |
| "Flood water not draining since 2 days", | |
| "Open drain near school dangerous", | |
| "Gutter overflowing in front of shop", | |
| "Drain cleaning not done this month", | |
| "Nallah blocked with garbage and plastic", | |
| "Waterlogging during rains destroys crops", | |
| "Drain overflow entering house basement", | |
| "Underground drain pipe cracked", | |
| "Monsoon drainage system non functional", | |
| "Open drain cover is safety hazard", | |
| "Sewage backing up into homes", | |
| "Nallah mein kachra bhara hua hai", | |
| "Barish mein paani ghar mein aa jata hai", | |
| "Nala saaf nahi hua abhi tak", | |
| "Drainage system completely collapsed", | |
| "Waterlogging on bus route daily", | |
| ], | |
| "Electricity": [ | |
| "Streetlight not working for 2 weeks", | |
| "Power outage daily for 6 hours", | |
| "Transformer damaged need replacement", | |
| "Electric pole leaning dangerously", | |
| "Loose wiring hanging from pole", | |
| "Voltage fluctuation damaging appliances", | |
| "Streetlight opposite school broken", | |
| "No electricity connection since registration", | |
| "Power cable fallen on ground", | |
| "Short circuit in public transformer", | |
| "Bijli nahi aa rahi 5 ghante se", | |
| "Street light kharab hai raste pe", | |
| "Transformer mein aag lag gayi", | |
| "Electric pole bent after storm", | |
| "No power backup in hospital area", | |
| "Frequent power cuts in summer", | |
| "Underground cable fault in colony", | |
| "Streetlight timer not working properly", | |
| "Electricity meter showing wrong reading", | |
| "Power outage in critical medical area", | |
| ], | |
| "Garbage & Sanitation": [ | |
| "Garbage not collected for one week", | |
| "Open dump site near residential area", | |
| "Dustbin overflowing with waste", | |
| "No sweeping happening in our street", | |
| "Garbage burning causing air pollution", | |
| "Dead animal on road not removed", | |
| "Public toilet not cleaned properly", | |
| "Solid waste dumped in empty plot", | |
| "Garbage truck not coming to our area", | |
| "Overflowing community dustbin spreading disease", | |
| "Kachra uthaya nahi gaya ek hafta se", | |
| "Safai karmchari nahi aate humare area mein", | |
| "Garbage dump near drinking water source", | |
| "Sweeping machine not deployed in ward", | |
| "Waste segregation not happening properly", | |
| "Trash littered around food market", | |
| "Community bin broken needs replacement", | |
| "Medical waste dumped in regular bin", | |
| "E-waste dumped in public area", | |
| "No garbage pickup on Sunday despite request", | |
| ], | |
| "Safety & Security": [ | |
| "Stray dogs attacking children in colony", | |
| "No street lights making area unsafe at night", | |
| "Illegal construction blocking emergency exit", | |
| "Crime increasing in our neighborhood", | |
| "No CCTV cameras at dangerous intersection", | |
| "Drunk driving accidents happening frequently", | |
| "Abandoned building used for illegal activities", | |
| "No police patrol in our area at night", | |
| "Harassment near women college", | |
| "Fire safety equipment missing in community hall", | |
| "Traffic signals not working at intersection", | |
| "Dangerous animals spotted near village", | |
| "Broken fence allowing trespassers", | |
| "Unsafe pedestrian crossing near school", | |
| "Drug dealing reported in public park", | |
| "Unregistered vehicles parked on footpath", | |
| "Chain snatching incidents increasing", | |
| "Eve teasing near bus stand", | |
| "No safety barriers near construction site", | |
| "Water body without safety fencing", | |
| ], | |
| "Public Health": [ | |
| "Dengue outbreak in our ward", | |
| "Primary health center has no medicines", | |
| "Hospital ward not cleaned properly", | |
| "Vaccination drive cancelled without notice", | |
| "Doctor not available at government clinic", | |
| "Ambulance not responding to emergency calls", | |
| "Food poisoning from community kitchen", | |
| "Mosquito fogging not done this season", | |
| "Stagnant water causing malaria spread", | |
| "No medical facility within 5 km", | |
| "Medicine shortage in PHC for 2 weeks", | |
| "Anganwadi center has no nutrition supply", | |
| "Contaminated food sold near school", | |
| "Mental health services not available", | |
| "TB patients not getting proper treatment", | |
| "Blood bank always out of stock", | |
| "Measles cases rising in slum area", | |
| "No ambulance service for rural patients", | |
| "Hospital overcrowded with no beds", | |
| "Expired medicines distributed at clinic", | |
| ], | |
| } | |
| texts = [] | |
| labels = [] | |
| for category, examples in data.items(): | |
| texts.extend(examples) | |
| labels.extend([category] * len(examples)) | |
| return texts, labels | |
| def train_model(): | |
| """Train and save the complaint classifier.""" | |
| texts, labels = load_training_data() | |
| print(f"\nTraining Data Statistics:") | |
| print(f" Total samples: {len(texts)}") | |
| print(f" Categories: {len(set(labels))}") | |
| for cat in sorted(set(labels)): | |
| count = labels.count(cat) | |
| print(f" {cat}: {count} samples") | |
| # TF-IDF Vectorizer | |
| vectorizer = TfidfVectorizer( | |
| max_features=5000, | |
| ngram_range=(1, 3), | |
| stop_words="english", | |
| sublinear_tf=True, | |
| ) | |
| X = vectorizer.fit_transform(texts) | |
| y = np.array(labels) | |
| # Train-test split | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) | |
| # SGD Classifier (fast, good for text) | |
| model = SGDClassifier( | |
| loss="modified_huber", # Gives probability estimates | |
| max_iter=1000, | |
| random_state=42, | |
| class_weight="balanced", | |
| ) | |
| model.fit(X_train, y_train) | |
| # Evaluate | |
| y_pred = model.predict(X_test) | |
| accuracy = (y_pred == y_test).mean() | |
| print(f"\n{'='*50}") | |
| print(f" Model Accuracy: {accuracy:.2%}") | |
| print(f"{'='*50}\n") | |
| print("Classification Report:") | |
| print(classification_report(y_test, y_pred)) | |
| # Cross-validation | |
| cv_scores = cross_val_score(model, X, y, cv=5, scoring="accuracy") | |
| print(f"\nCross-validation: {cv_scores.mean():.2%} (±{cv_scores.std():.2%})") | |
| # Save model | |
| output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ml", "weights") | |
| os.makedirs(output_dir, exist_ok=True) | |
| output_path = os.path.join(output_dir, "classifier.pkl") | |
| bundle = { | |
| "model": model, | |
| "vectorizer": vectorizer, | |
| "categories": sorted(set(labels)), | |
| "accuracy": accuracy, | |
| } | |
| joblib.dump(bundle, output_path) | |
| print(f"\n✅ Model saved to: {output_path}") | |
| print(f" Accuracy: {accuracy:.2%}") | |
| print(f" Categories: {len(set(labels))}") | |
| return model, vectorizer | |
| if __name__ == "__main__": | |
| train_model() | |