"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Bli_bGuux1CJr22uJYxsoLSQkr5LjXvD
"""
|
|
import random
import pandas as pd

# Complaint templates for each category
categories = {
    "Garbage": [
        "Garbage not collected",
        "Trash piled up",
        "Waste scattered everywhere",
        "Debris dumped carelessly",
        "Rubbish overflowing",
        "Litter causing bad smell",
        "Uncollected scrap lying around",
        "Filth spread all over",
        "Junk thrown carelessly",
        "Refuse dumped openly",
        "Garbage heap blocking the way",
        "Dumping ground overflowing"
    ],
    "Water": [
        "Water pipeline leaking",
        "No water supply",
        "Contaminated tap water",
        "Low water pressure",
        "Water tanker not arrived",
        "Sewage water overflow",
        "Drainage issue",
        "Sewer blockage reported",
        "Flooding due to heavy rain",
        "Water logging problem",
        "Dirty water flowing",
        "Burst pipeline issue"
    ],
    "Roads": [
        "Big pothole on the road",
        "Damaged road surface",
        "Cracks on the road",
        "Uneven surface making driving difficult",
        "Broken speed breaker",
        "Debris blocking the road",
        "Manhole cover missing",
        "Broken pavement",
        "Damaged footpath",
        "Road erosion reported",
        "Construction waste dumped on road",
        "Street blocked due to cave-in"
    ],
    "Electricity": [
        "Frequent power cuts",
        "Load shedding problem",
        "Voltage fluctuation issue",
        "Transformer not working",
        "Wire hanging dangerously",
        "No electricity supply",
        "Complete blackout",
        "Short circuit issue reported",
        "Electrical failure in houses",
        "Electric spark observed",

        "Streetlight not working",
        "Streetlight bulb fused",
        "Dark area due to broken streetlight",
        "Streetlight flickering",
        "Streetlight pole damaged",
        "Entire lane dark without lights"
    ]
}
|
|
# Number of complaints to generate per category
num_samples = 300
data = []

for category, templates in categories.items():
    for _ in range(num_samples):
        template = random.choice(templates)
        data.append({
            "Complaint Text": template,
            "Category": category
        })

# Build the DataFrame and shuffle the rows
df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

# Save the synthetic dataset
df.to_csv("synthetic_civic_complaints_no_location.csv", index=False, encoding="utf-8")

print("✅ Final synonym-rich dataset created: synthetic_civic_complaints_no_location.csv")
display(df.head())
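
# Optional sanity check (a quick sketch, not part of the original notebook):
# the generator draws num_samples rows per category, so each of the four
# classes should appear exactly 300 times.
print(df["Category"].value_counts())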
|
|
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset generated above
df = pd.read_csv("synthetic_civic_complaints_no_location.csv")
|
|
# Basic preprocessing
df["Complaint Text"] = df["Complaint Text"].str.lower()

X = df["Complaint Text"]
y = df["Category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features + Logistic Regression baseline
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_vec, y_train)

# Evaluate on the held-out test split
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap="Blues", values_format="d")
plt.show()
|
|
# Cross-validation with a TF-IDF + Logistic Regression pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=500))
])

scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())

# Learning curve
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X, y, cv=5, scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 5)
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training score")
plt.plot(train_sizes, val_mean, label="Validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid(True)
plt.show()
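
# Quick smoke test of the scikit-learn baseline (a minimal sketch, not in the
# original notebook): fit the pipeline on the full dataset and classify one
# made-up complaint to see the predicted category.
pipe.fit(X, y)
print(pipe.predict(["water pipeline leaking near the main road"]))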
|
|
import spacy
from spacy.training.example import Example

# Blank English pipeline with a text classifier
nlp = spacy.blank("en")

textcat = nlp.add_pipe("textcat")
textcat.add_label("Garbage")
textcat.add_label("Water")
textcat.add_label("Roads")
textcat.add_label("Electricity")

# Convert the DataFrame into spaCy training examples
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {cat: 0.0 for cat in textcat.labels}
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Train the text classifier
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {i+1}, Losses: {losses}")

# Save the trained pipeline
nlp.to_disk("complaint_textcat_model")
print("✅ Text classification model saved: complaint_textcat_model")
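
# Optional reload check (a minimal sketch, not in the original notebook):
# load the saved pipeline back from disk and score one sample complaint;
# doc.cats maps each label to a score.
reloaded_nlp = spacy.load("complaint_textcat_model")
doc = reloaded_nlp("garbage not collected for a week")
print(max(doc.cats, key=doc.cats.get), doc.cats)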
|
|
import spacy
from spacy.training.example import Example
import random

# Rebuild the training data with explicit per-category scores
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {
        "Garbage": 0.0,
        "Water": 0.0,
        "Roads": 0.0,
        "Electricity": 0.0
    }
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Fresh blank pipeline with a text classifier
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
for label in ["Garbage", "Water", "Roads", "Electricity"]:
    textcat.add_label(label)

nlp.initialize()

# Train for a few epochs, shuffling the data each time
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ann)
        nlp.update([example], losses=losses)
    print(f"Epoch {itn+1}, Losses: {losses}")
|
|
# Predict the category with the trained model, then assign a priority
# using simple keyword rules
def predict_complaint(text):
    doc = nlp(text)

    cats = doc.cats
    category = max(cats, key=cats.get)

    text_lower = text.lower()
    urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

    priority = "Low"
    if any(word in text_lower for word in urgent_words):
        priority = "High"
    elif any(word in text_lower for word in medium_words):
        priority = "Medium"

    return {
        "Complaint": text,
        "Predicted Category": category,
        "Priority": priority
    }

print(predict_complaint("Debris dumped behind chandni chowk"))
print(predict_complaint("Streetlight not working near ChANdni chowk, its very dangerous"))
|
|
import pickle

# Wrap the spaCy model together with the rule-based priority logic
class ComplaintClassifier:
    def __init__(self, nlp_model):
        self.nlp = nlp_model

    def predict(self, text):
        doc = self.nlp(text)
        cats = doc.cats
        category = max(cats, key=cats.get)

        text_lower = text.lower()
        urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
        medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

        priority = "Low"
        if any(word in text_lower for word in urgent_words):
            priority = "High"
        elif any(word in text_lower for word in medium_words):
            priority = "Medium"

        return {
            "Complaint": text,
            "Predicted Category": category,
            "Priority": priority
        }

classifier = ComplaintClassifier(nlp)

# Pickle the wrapper so the API can load it later
with open("complaint_model.pkl", "wb") as f:
    pickle.dump(classifier, f)

print("✅ complaint_model.pkl saved successfully")
|
|
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import nest_asyncio
import pickle
import spacy

# Load the pickled ComplaintClassifier wrapper saved above
# (the class definition from the previous cell must be available when unpickling)
with open("complaint_model.pkl", "rb") as f:
    classifier = pickle.load(f)

def detect_priority(text: str) -> str:
    text_lower = text.lower()
    urgent_words = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_words = ["not working", "overflow", "leak", "delay", "low pressure"]

    if any(word in text_lower for word in urgent_words):
        return "High"
    elif any(word in text_lower for word in medium_words):
        return "Medium"
    return "Low"

app = FastAPI()

class ComplaintInput(BaseModel):
    text: str

@app.post("/predict")
async def predict_complaint(input_data: ComplaintInput):
    # The pickle holds a ComplaintClassifier, so run its spaCy pipeline directly
    doc = classifier.nlp(input_data.text)
    cats = doc.cats
    category = max(cats, key=cats.get)
    priority = detect_priority(input_data.text)

    return {
        "Complaint": input_data.text,
        "Predicted Category": category,
        "Priority": priority,
        "Raw Scores": cats
    }

if __name__ == "__main__":
    try:
        nest_asyncio.apply()
        uvicorn.run(app, host="0.0.0.0", port=7860)
    except RuntimeError:
        # Ignore event-loop errors when running inside a notebook
        pass
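
# Example client call (a sketch, not part of the original notebook): query the
# /predict endpoint with one complaint. Run this from a separate cell or
# process while the server above is up; it assumes the default port 7860.
# import requests
# response = requests.post(
#     "http://localhost:7860/predict",
#     json={"text": "Streetlight not working, very dangerous"},
# )
# print(response.json())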
|
|
|
|