|
|
|
|
|
"""app.ipynb |
|
|
|
|
|
Automatically generated by Colab. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/1Bli_bGuux1CJr22uJYxsoLSQkr5LjXvD |
|
|
""" |
|
|
|
|
|
import random |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Template complaint phrases per civic-issue category. These are sampled
# (with replacement) below to synthesize a labelled training dataset.
categories = {
    "Garbage": [
        "Garbage not collected",
        "Trash piled up",
        "Waste scattered everywhere",
        "Debris dumped carelessly",
        "Rubbish overflowing",
        "Litter causing bad smell",
        "Uncollected scrap lying around",
        "Filth spread all over",
        "Junk thrown carelessly",
        "Refuse dumped openly",
        "Garbage heap blocking the way",
        "Dumping ground overflowing"
    ],
    "Water": [
        "Water pipeline leaking",
        "No water supply",
        "Contaminated tap water",
        "Low water pressure",
        "Water tanker not arrived",
        "Sewage water overflow",
        "Drainage issue",
        "Sewer blockage reported",
        "Flooding due to heavy rain",
        "Water logging problem",
        "Dirty water flowing",
        "Burst pipeline issue"
    ],
    "Roads": [
        "Big pothole on the road",
        "Damaged road surface",
        "Cracks on the road",
        "Uneven surface making driving difficult",
        "Broken speed breaker",
        "Debris blocking the road",
        "Manhole cover missing",
        "Broken pavement",
        "Damaged footpath",
        "Road erosion reported",
        "Construction waste dumped on road",
        "Street blocked due to cave-in"
    ],
    "Electricity": [
        # Power-supply complaints
        "Frequent power cuts",
        "Load shedding problem",
        "Voltage fluctuation issue",
        "Transformer not working",
        "Wire hanging dangerously",
        "No electricity supply",
        "Complete blackout",
        "Short circuit issue reported",
        "Electrical failure in houses",
        "Electric spark observed",
        # Streetlight complaints
        "Streetlight not working",
        "Streetlight bulb fused",
        "Dark area due to broken streetlight",
        "Streetlight flickering",
        "Streetlight pole damaged",
        "Entire lane dark without lights"
    ]
}
|
|
|
|
|
|
|
|
# Build a balanced synthetic dataset: num_samples rows per category, each
# row a template phrase drawn at random (with replacement).
num_samples = 300
data = []

for category, templates in categories.items():
    for _ in range(num_samples):
        data.append({
            "Complaint Text": random.choice(templates),
            "Category": category
        })

df = pd.DataFrame(data)

# Shuffle so categories are interleaved rather than grouped by label.
df = df.sample(frac=1).reset_index(drop=True)

df.to_csv("synthetic_civic_complaints_no_location.csv", index=False, encoding="utf-8")

# NOTE(review): the original source contained a mangled emoji that split this
# string literal across two lines (a SyntaxError); reconstructed as "✅".
print("✅ Final synonym-rich dataset created: synthetic_civic_complaints_no_location.csv")

# `display` is an IPython/Colab builtin and is undefined when this notebook
# export runs as a plain script — fall back to print in that case.
try:
    display(df.head())
except NameError:
    print(df.head())
|
|
|
|
|
import pandas as pd |
|
|
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay |
|
|
import matplotlib.pyplot as plt |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
# Load the dataset generated above.
# BUG FIX: the original read "synthetic_civic_complaints_rich.csv", a file this
# script never creates; the generator writes the "no_location" CSV. (If a
# separate "rich" dataset exists outside this file, revert — TODO confirm.)
df = pd.read_csv("synthetic_civic_complaints_no_location.csv")

# Lowercase so TF-IDF treats "Water" and "water" as the same token.
df["Complaint Text"] = df["Complaint Text"].str.lower()

X = df["Complaint Text"]
y = df["Category"]

# Stratified 80/20 split keeps class proportions identical in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
|
|
|
|
|
|
|
|
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
# Fit the vocabulary on training text only; transform the test split with it.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Baseline linear classifier; max_iter raised so the solver converges.
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_vec, y_train)
|
|
|
|
|
|
|
|
# Evaluate the baseline on the held-out split.
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix with rows/columns in the classifier's own label order.
labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap="Blues", values_format="d")
plt.show()
|
|
|
|
|
|
|
|
from sklearn.pipeline import Pipeline

# Same TF-IDF + logistic-regression setup as above, wrapped in a Pipeline so
# the vectorizer is re-fit inside each CV fold (no train/test leakage).
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=500))
])

# 5-fold cross-validated accuracy on the full dataset.
scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())
|
|
|
|
|
|
|
|
# Learning curve: train/validation accuracy at 5 training-set sizes (10%-100%).
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X, y, cv=5, scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 5)
)

# Average the per-fold scores for plotting.
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training score")
plt.plot(train_sizes, val_mean, label="Validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid(True)
plt.show()
|
|
|
|
|
import spacy |
|
|
from spacy.training.example import Example |
|
|
|
|
|
|
|
|
# Build a blank English pipeline with a text-classification head.
nlp = spacy.blank("en")

textcat = nlp.add_pipe("textcat")
textcat.add_label("Garbage")
textcat.add_label("Water")
textcat.add_label("Roads")
textcat.add_label("Electricity")

# Convert DataFrame rows into spaCy's (text, {"cats": ...}) training format:
# every label gets 0.0 except the gold category, which gets 1.0.
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {cat: 0.0 for cat in textcat.labels}
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# CONSISTENCY FIX: `begin_training` is the deprecated spaCy v2 name;
# `initialize` is the v3 equivalent and is what the second training block in
# this file already uses.
optimizer = nlp.initialize()
for i in range(20):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {i+1}, Losses: {losses}")

nlp.to_disk("complaint_textcat_model")
# NOTE(review): mangled emoji reconstructed as "✅" (original split the literal).
print("✅ Text classification model saved: complaint_textcat_model")
|
|
|
|
|
import spacy |
|
|
from spacy.training.example import Example |
|
|
import random |
|
|
|
|
|
|
|
|
# Rebuild the training data with an explicit per-label dict (same result as
# the comprehension used in the earlier training cell).
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    # One-hot style targets: 0.0 everywhere, 1.0 for the gold category.
    cats = {
        "Garbage": 0.0,
        "Water": 0.0,
        "Roads": 0.0,
        "Electricity": 0.0
    }
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Fresh blank pipeline: this rebinds `nlp`, discarding the model trained in
# the previous cell, and retrains from scratch.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
for label in ["Garbage", "Water", "Roads", "Electricity"]:
    textcat.add_label(label)

nlp.initialize()

for itn in range(10):
    # Shuffle each epoch so the single-example updates arrive in a new order.
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ann)
        # sgd omitted: spaCy falls back to the pipeline's own optimizer.
        nlp.update([example], losses=losses)
    print(f"Epoch {itn+1}, Losses: {losses}")
|
|
|
|
|
|
|
|
def predict_complaint(text):
    """Classify a complaint with the module-level spaCy `nlp` pipeline and
    attach a keyword-based priority level.

    Returns a dict with the original text, the highest-scoring category,
    and a "High"/"Medium"/"Low" priority.
    """
    scores = nlp(text).cats
    best_category = max(scores, key=scores.get)

    # Case-insensitive keyword heuristic; urgent keywords beat medium ones.
    lowered = text.lower()
    urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

    if any(w in lowered for w in urgent_words):
        priority = "High"
    elif any(w in lowered for w in medium_words):
        priority = "Medium"
    else:
        priority = "Low"

    return {
        "Complaint": text,
        "Predicted Category": best_category,
        "Priority": priority
    }
|
|
|
|
|
|
|
|
# Smoke-test the classifier on two unseen phrasings (mixed-case location text).
print(predict_complaint("Debris dumped behind chandni chowk"))
print(predict_complaint("Streetlight not working near ChANdni chowk, its very dangerous"))
|
|
|
|
|
import pickle |
|
|
|
|
|
|
|
|
class ComplaintClassifier:
    """Pickle-friendly wrapper around a trained spaCy textcat pipeline.

    Bundles category prediction with the keyword-based priority heuristic so
    a single object can be persisted and served.
    """

    def __init__(self, nlp_model):
        # Trained spaCy Language (any callable returning a doc with `.cats`).
        self.nlp = nlp_model

    def __call__(self, text):
        """Delegate direct calls to the wrapped pipeline.

        BUG FIX: the serving code unpickles this object into a variable named
        `nlp` and invokes it as `nlp(text)`; without __call__ that raised
        TypeError ('ComplaintClassifier' object is not callable).
        """
        return self.nlp(text)

    def predict(self, text):
        """Return category and priority for one complaint.

        Returns a dict with keys "Complaint", "Predicted Category", "Priority".
        """
        doc = self.nlp(text)
        cats = doc.cats
        # Highest-scoring label wins.
        category = max(cats, key=cats.get)

        # Case-insensitive keyword heuristic; urgent keywords beat medium ones.
        text_lower = text.lower()
        urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
        medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

        priority = "Low"
        if any(word in text_lower for word in urgent_words):
            priority = "High"
        elif any(word in text_lower for word in medium_words):
            priority = "Medium"

        return {
            "Complaint": text,
            "Predicted Category": category,
            "Priority": priority
        }
|
|
|
|
|
|
|
|
# Wrap the trained pipeline so category + priority prediction pickles as one object.
classifier = ComplaintClassifier(nlp)

# NOTE(review): pickle is convenient but unsafe to load from untrusted sources;
# spaCy's native to_disk/from_disk is the more robust persistence path.
with open("complaint_model.pkl", "wb") as f:
    pickle.dump(classifier, f)

# NOTE(review): mangled emoji reconstructed as "✅" — the original split this
# string literal across two lines (a SyntaxError as extracted).
print("✅ complaint_model.pkl saved successfully")
|
|
|
|
|
from fastapi import FastAPI |
|
|
from pydantic import BaseModel |
|
|
import uvicorn |
|
|
import nest_asyncio |
|
|
import pickle |
|
|
import spacy |
|
|
|
|
|
|
|
|
|
|
|
# Load the pickled ComplaintClassifier wrapper saved earlier.
# NOTE(review): despite the name, `nlp` holds the wrapper object, not a bare
# spaCy Language. Also, only unpickle files from trusted sources — pickle can
# execute arbitrary code on load.
with open("complaint_model.pkl", "rb") as f:
    nlp = pickle.load(f)
|
|
|
|
|
|
|
|
def detect_priority(text: str) -> str:
    """Assign a "High"/"Medium"/"Low" priority from urgency keywords in *text*.

    Matching is case-insensitive. Urgent keywords take precedence over medium
    ones; anything else defaults to "Low".
    """
    lowered = text.lower()
    keyword_levels = [
        ("High", ["urgent", "dangerous", "immediately", "accident", "severe"]),
        ("Medium", ["not working", "overflow", "leak", "delay", "low pressure"]),
    ]
    for level, words in keyword_levels:
        if any(word in lowered for word in words):
            return level
    return "Low"
|
|
|
|
|
|
|
|
# FastAPI application exposing the classifier over HTTP.
app = FastAPI()

class ComplaintInput(BaseModel):
    # Raw complaint text submitted by the client in the POST body.
    text: str
|
|
|
|
|
@app.post("/predict")
async def predict_complaint(input_data: ComplaintInput):
    """Classify a civic complaint; return category, priority and raw scores.

    BUG FIX: complaint_model.pkl stores a ComplaintClassifier wrapper, not a
    bare spaCy Language, so calling `nlp(...)` directly raised TypeError.
    Unwrap the wrapper's inner pipeline when present; a bare Language (or a
    wrapper made callable) still works via the fallback.
    """
    model = nlp.nlp if hasattr(nlp, "nlp") else nlp
    doc = model(input_data.text)
    cats = doc.cats
    # Highest-scoring label wins.
    category = max(cats, key=cats.get)
    priority = detect_priority(input_data.text)

    return {
        "Complaint": input_data.text,
        "Predicted Category": category,
        "Priority": priority,
        "Raw Scores": cats
    }
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        # Allow uvicorn to start inside an already-running event loop
        # (e.g. Jupyter/Colab), where asyncio would otherwise refuse.
        nest_asyncio.apply()
        uvicorn.run(app, host="0.0.0.0", port=7860)
    except RuntimeError as exc:
        # Report the failure instead of silently swallowing it
        # (the original `pass` hid startup errors entirely).
        print(f"Server not started: {exc}")
|
|
|
|
|
|