"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Bli_bGuux1CJr22uJYxsoLSQkr5LjXvD
"""
|
|
import random
import pandas as pd

# Complaint templates for each category
categories = {
    "Garbage": [
        "Garbage not collected",
        "Trash piled up",
        "Waste scattered everywhere",
        "Debris dumped carelessly",
        "Rubbish overflowing",
        "Litter causing bad smell",
        "Uncollected scrap lying around",
        "Filth spread all over",
        "Junk thrown carelessly",
        "Refuse dumped openly",
        "Garbage heap blocking the way",
        "Dumping ground overflowing"
    ],
    "Water": [
        "Water pipeline leaking",
        "No water supply",
        "Contaminated tap water",
        "Low water pressure",
        "Water tanker not arrived",
        "Sewage water overflow",
        "Drainage issue",
        "Sewer blockage reported",
        "Flooding due to heavy rain",
        "Water logging problem",
        "Dirty water flowing",
        "Burst pipeline issue"
    ],
    "Roads": [
        "Big pothole on the road",
        "Damaged road surface",
        "Cracks on the road",
        "Uneven surface making driving difficult",
        "Broken speed breaker",
        "Debris blocking the road",
        "Manhole cover missing",
        "Broken pavement",
        "Damaged footpath",
        "Road erosion reported",
        "Construction waste dumped on road",
        "Street blocked due to cave-in"
    ],
    "Electricity": [
        "Frequent power cuts",
        "Load shedding problem",
        "Voltage fluctuation issue",
        "Transformer not working",
        "Wire hanging dangerously",
        "No electricity supply",
        "Complete blackout",
        "Short circuit issue reported",
        "Electrical failure in houses",
        "Electric spark observed",

        "Streetlight not working",
        "Streetlight bulb fused",
        "Dark area due to broken streetlight",
        "Streetlight flickering",
        "Streetlight pole damaged",
        "Entire lane dark without lights"
    ]
}
|
|
# Number of complaints to generate per category
num_samples = 300
data = []

for category, templates in categories.items():
    for _ in range(num_samples):
        template = random.choice(templates)
        data.append({
            "Complaint Text": template,
            "Category": category
        })

# Build the DataFrame and shuffle the rows
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

# Save the synthetic dataset
df.to_csv("synthetic_civic_complaints_no_location.csv", index=False, encoding="utf-8")

print("✅ Final synonym-rich dataset created: synthetic_civic_complaints_no_location.csv")
display(df.head())
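
# Optional sanity check (a quick sketch, not part of the original notebook):
# the generator draws num_samples rows per category, so each of the four
# classes should appear exactly 300 times.
print(df["Category"].value_counts())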
|
|
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset generated above
df = pd.read_csv("synthetic_civic_complaints_no_location.csv")
|
|
# Basic preprocessing
df["Complaint Text"] = df["Complaint Text"].str.lower()

X = df["Complaint Text"]
y = df["Category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features + Logistic Regression baseline
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_vec, y_train)

# Evaluate on the held-out test split
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap="Blues", values_format="d")
plt.show()
|
|
# Cross-validation with a TF-IDF + Logistic Regression pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=500))
])

scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())

# Learning curve
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X, y, cv=5, scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 5)
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training score")
plt.plot(train_sizes, val_mean, label="Validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid(True)
plt.show()
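
# Quick smoke test of the scikit-learn baseline (a minimal sketch, not in the
# original notebook): fit the pipeline on the full dataset and classify one
# made-up complaint to see the predicted category.
pipe.fit(X, y)
print(pipe.predict(["water pipeline leaking near the main road"]))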
|
|
import spacy
from spacy.training.example import Example

# Blank English pipeline with a text classifier
nlp = spacy.blank("en")

textcat = nlp.add_pipe("textcat")
textcat.add_label("Garbage")
textcat.add_label("Water")
textcat.add_label("Roads")
textcat.add_label("Electricity")

# Convert the DataFrame into spaCy training examples
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {cat: 0.0 for cat in textcat.labels}
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Train the text classifier
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {i+1}, Losses: {losses}")

# Save the trained pipeline
nlp.to_disk("complaint_textcat_model")
print("✅ Text classification model saved: complaint_textcat_model")
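
# Optional reload check (a minimal sketch, not in the original notebook):
# load the saved pipeline back from disk and score one sample complaint;
# doc.cats maps each label to a score.
reloaded_nlp = spacy.load("complaint_textcat_model")
doc = reloaded_nlp("garbage not collected for a week")
print(max(doc.cats, key=doc.cats.get), doc.cats)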
|
|
import spacy
from spacy.training.example import Example
import random

# Rebuild the training data with explicit per-category scores
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {
        "Garbage": 0.0,
        "Water": 0.0,
        "Roads": 0.0,
        "Electricity": 0.0
    }
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Fresh blank pipeline with a text classifier
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
for label in ["Garbage", "Water", "Roads", "Electricity"]:
    textcat.add_label(label)

nlp.initialize()

# Train for a few epochs, shuffling the data each time
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ann)
        nlp.update([example], losses=losses)
    print(f"Epoch {itn+1}, Losses: {losses}")
|
|
# Predict the category with the trained model, then assign a priority
# using simple keyword rules
def predict_complaint(text):
    doc = nlp(text)

    cats = doc.cats
    category = max(cats, key=cats.get)

    text_lower = text.lower()
    urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

    priority = "Low"
    if any(word in text_lower for word in urgent_words):
        priority = "High"
    elif any(word in text_lower for word in medium_words):
        priority = "Medium"

    return {
        "Complaint": text,
        "Predicted Category": category,
        "Priority": priority
    }

print(predict_complaint("Debris dumped behind chandni chowk"))
print(predict_complaint("Streetlight not working near ChANdni chowk, its very dangerous"))
|
|
import pickle

# Wrap the spaCy model together with the rule-based priority logic
class ComplaintClassifier:
    def __init__(self, nlp_model):
        self.nlp = nlp_model

    def predict(self, text):
        doc = self.nlp(text)
        cats = doc.cats
        category = max(cats, key=cats.get)

        text_lower = text.lower()
        urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
        medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

        priority = "Low"
        if any(word in text_lower for word in urgent_words):
            priority = "High"
        elif any(word in text_lower for word in medium_words):
            priority = "Medium"

        return {
            "Complaint": text,
            "Predicted Category": category,
            "Priority": priority
        }

classifier = ComplaintClassifier(nlp)

# Pickle the wrapper so the API can load it later
with open("complaint_model.pkl", "wb") as f:
    pickle.dump(classifier, f)

print("✅ complaint_model.pkl saved successfully")
|
|
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import nest_asyncio
import pickle
import spacy

# Load the pickled ComplaintClassifier wrapper saved above
# (the class definition from the previous cell must be available when unpickling)
with open("complaint_model.pkl", "rb") as f:
    classifier = pickle.load(f)

def detect_priority(text: str) -> str:
    text_lower = text.lower()
    urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

    if any(word in text_lower for word in urgent_words):
        return "High"
    elif any(word in text_lower for word in medium_words):
        return "Medium"
    return "Low"

app = FastAPI()

class ComplaintInput(BaseModel):
    text: str

@app.post("/predict")
async def predict_complaint(input_data: ComplaintInput):
    # The pickle holds a ComplaintClassifier, so run its spaCy pipeline directly
    doc = classifier.nlp(input_data.text)
    cats = doc.cats
    category = max(cats, key=cats.get)
    priority = detect_priority(input_data.text)

    return {
        "Complaint": input_data.text,
        "Predicted Category": category,
        "Priority": priority,
        "Raw Scores": cats
    }

if __name__ == "__main__":
    try:
        nest_asyncio.apply()
        uvicorn.run(app, host="0.0.0.0", port=7860)
    except RuntimeError:
        # Ignore event-loop errors when running inside a notebook
        pass
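
# Example client call (a sketch, not part of the original notebook): query the
# /predict endpoint with one complaint. Run this from a separate cell or
# process while the server above is up; it assumes the default port 7860.
# import requests
# response = requests.post(
#     "http://localhost:7860/predict",
#     json={"text": "Streetlight not working, very dangerous"},
# )
# print(response.json())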
|
|
|
|