Amii2410 committed on
Commit
ab8e415
·
verified ·
1 Parent(s): 465e861

Upload 3 files

Browse files
Files changed (3) hide show
  1. Spacy.txt +1 -0
  2. app (1).py +366 -0
  3. requirements.txt +8 -0
Spacy.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ spacy[transformers]
app (1).py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1Bli_bGuux1CJr22uJYxsoLSQkr5LjXvD
"""

import random
import pandas as pd

# Complaint categories with 10–12 synonym-rich templates each (no {} placeholders now)
categories = {
    "Garbage": [
        "Garbage not collected",
        "Trash piled up",
        "Waste scattered everywhere",
        "Debris dumped carelessly",
        "Rubbish overflowing",
        "Litter causing bad smell",
        "Uncollected scrap lying around",
        "Filth spread all over",
        "Junk thrown carelessly",
        "Refuse dumped openly",
        "Garbage heap blocking the way",
        "Dumping ground overflowing"
    ],
    "Water": [
        "Water pipeline leaking",
        "No water supply",
        "Contaminated tap water",
        "Low water pressure",
        "Water tanker not arrived",
        "Sewage water overflow",
        "Drainage issue",
        "Sewer blockage reported",
        "Flooding due to heavy rain",
        "Water logging problem",
        "Dirty water flowing",
        "Burst pipeline issue"
    ],
    "Roads": [
        "Big pothole on the road",
        "Damaged road surface",
        "Cracks on the road",
        "Uneven surface making driving difficult",
        "Broken speed breaker",
        "Debris blocking the road",
        "Manhole cover missing",
        "Broken pavement",
        "Damaged footpath",
        "Road erosion reported",
        "Construction waste dumped on road",
        "Street blocked due to cave-in"
    ],
    "Electricity": [
        # General electricity
        "Frequent power cuts",
        "Load shedding problem",
        "Voltage fluctuation issue",
        "Transformer not working",
        "Wire hanging dangerously",
        "No electricity supply",
        "Complete blackout",
        "Short circuit issue reported",
        "Electrical failure in houses",
        "Electric spark observed",
        # Streetlight related
        "Streetlight not working",
        "Streetlight bulb fused",
        "Dark area due to broken streetlight",
        "Streetlight flickering",
        "Streetlight pole damaged",
        "Entire lane dark without lights"
    ]
}

# Number of complaints per category (balanced dataset)
num_samples = 300  # per category

# Draw `num_samples` random templates per category so every class is equally
# represented (4 categories * 300 = 1200 rows total).
data = []
for category, templates in categories.items():
    for _ in range(num_samples):
        data.append({
            "Complaint Text": random.choice(templates),
            "Category": category
        })

# Convert to DataFrame
df = pd.DataFrame(data)

# Shuffle rows so categories are interleaved instead of grouped
df = df.sample(frac=1).reset_index(drop=True)

# Save CSV
df.to_csv("synthetic_civic_complaints_no_location.csv", index=False, encoding="utf-8")

print("✅ Final synonym-rich dataset created: synthetic_civic_complaints_no_location.csv")
# BUG FIX: `display` only exists inside IPython/Colab; running this as a plain
# Python script raised NameError. print() works everywhere.
print(df.head())
102
+
103
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# 1. Load dataset
# BUG FIX: read the CSV that the generator above actually writes
# ("synthetic_civic_complaints_no_location.csv"). The previous name
# ("synthetic_civic_complaints_rich.csv") is never created anywhere in this
# script, so this line always failed with FileNotFoundError.
df = pd.read_csv("synthetic_civic_complaints_no_location.csv")

# 🔹 Make all complaint text lowercase (case-insensitive)
df["Complaint Text"] = df["Complaint Text"].str.lower()

# 2. Train-test split (stratified so each category keeps its proportion)
X = df["Complaint Text"]
y = df["Category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Vectorizer + classifier
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_vec, y_train)

# 4. Evaluate on the held-out split
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 5. Confusion Matrix
labels = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap="Blues", values_format="d")
plt.show()

# 6. Cross-validation (pipeline so the vectorizer is refit per fold,
# avoiding train/test leakage)
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=500))
])

scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())

# 7. Learning Curve
train_sizes, train_scores, val_scores = learning_curve(
    pipe, X, y, cv=5, scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 5)
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training score")
plt.plot(train_sizes, val_mean, label="Validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid(True)
plt.show()
174
+
175
import spacy
from spacy.training.example import Example

# Create blank English pipeline
nlp = spacy.blank("en")

# Add text categorizer instead of NER
textcat = nlp.add_pipe("textcat")
textcat.add_label("Garbage")
textcat.add_label("Water")
textcat.add_label("Roads")
textcat.add_label("Electricity")

# Prepare training data: one (text, {"cats": one-hot}) pair per complaint row
TRAIN_DATA = []
for _, row in df.iterrows():
    text = row["Complaint Text"]
    label = row["Category"]
    cats = {cat: 0.0 for cat in textcat.labels}
    cats[label] = 1.0
    TRAIN_DATA.append((text, {"cats": cats}))

# Train the text classifier.
# FIX: `nlp.begin_training()` is deprecated in spaCy v3 — `nlp.initialize()`
# is the v3 API and likewise returns the optimizer.
optimizer = nlp.initialize()
for i in range(20):  # epochs
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {i+1}, Losses: {losses}")

# Save model
nlp.to_disk("complaint_textcat_model")
print("✅ Text classification model saved: complaint_textcat_model")
210
+
211
import spacy
from spacy.training.example import Example
import random

# 🔹 Build text classification training data: each complaint becomes a
# (text, {"cats": one-hot dict}) training pair.
category_labels = ["Garbage", "Water", "Roads", "Electricity"]

TRAIN_DATA = []
for _, row in df.iterrows():
    one_hot = {name: 0.0 for name in category_labels}
    one_hot[row["Category"]] = 1.0
    TRAIN_DATA.append((row["Complaint Text"], {"cats": one_hot}))

# 🔹 Create blank pipeline with text categorizer
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
for name in category_labels:
    textcat.add_label(name)

nlp.initialize()

# 🔹 Train model: reshuffle every epoch, accumulate losses for logging
for itn in range(10):  # epochs
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), ann)
        nlp.update([example], losses=losses)
    print(f"Epoch {itn+1}, Losses: {losses}")
246
+
247
# 🔹 Complaint prediction function
def predict_complaint(text):
    """Classify one complaint string and attach a keyword-based priority.

    Returns a dict with the original complaint, the highest-scoring
    category from the trained textcat pipeline, and a High/Medium/Low
    priority derived from keyword matching.
    """
    # Step 1 → Category prediction: take the top-scoring category
    scores = nlp(text).cats
    best_category = max(scores, key=scores.get)

    # Step 2 → Priority detection (case-insensitive keyword scan;
    # urgent keywords win over medium ones)
    lowered = text.lower()
    high_triggers = ["urgent", "dangerous", "immediately", "accident", "severe"]
    medium_triggers = ["not working", "overflow", "leak", "delay", "low pressure"]

    if any(word in lowered for word in high_triggers):
        level = "High"
    elif any(word in lowered for word in medium_triggers):
        level = "Medium"
    else:
        level = "Low"

    return {
        "Complaint": text,
        "Predicted Category": best_category,
        "Priority": level
    }

# 🔹 Test it
print(predict_complaint("Debris dumped behind chandni chowk"))
print(predict_complaint("Streetlight not working near ChANdni chowk, its very dangerous"))
275
+
276
import pickle

# Wrapper so spaCy model can be pickled
class ComplaintClassifier:
    """Pickle-friendly wrapper pairing a spaCy text classifier with a
    keyword-based priority heuristic."""

    def __init__(self, nlp_model):
        # Underlying spaCy pipeline; callable, returns a Doc with `.cats`.
        self.nlp = nlp_model

    def predict(self, text):
        """Return category, priority, and the original complaint text."""
        scores = self.nlp(text).cats
        top_category = max(scores, key=scores.get)

        # Priority detection: first matching tier wins; default is "Low".
        lowered = text.lower()
        tiers = (
            ("High", ("urgent", "dangerous", "immediately", "accident", "severe")),
            ("Medium", ("not working", "overflow", "leak", "delay", "low pressure")),
        )
        priority = "Low"
        for level, markers in tiers:
            if any(marker in lowered for marker in markers):
                priority = level
                break

        return {
            "Complaint": text,
            "Predicted Category": top_category,
            "Priority": priority
        }
304
+
305
# Wrap trained spaCy model
# NOTE: the ComplaintClassifier wrapper (not the raw spaCy pipeline) is what
# gets pickled below, so whatever unpickles complaint_model.pkl receives a
# ComplaintClassifier and must use .predict(text) / .nlp on it.
classifier = ComplaintClassifier(nlp)

# Save with pickle
# NOTE(review): unpickling requires ComplaintClassifier to be importable from
# the loading module; here it is defined in this same script.
with open("complaint_model.pkl", "wb") as f:
    pickle.dump(classifier, f)

print("✅ complaint_model.pkl saved successfully")
313
+
314
+ from fastapi import FastAPI
315
+ from pydantic import BaseModel
316
+ import uvicorn
317
+ import nest_asyncio
318
+ import pickle
319
+ import spacy
320
+
321
# ========== Load trained model ==========
# Make sure you have already trained & saved it as complaint_model.pkl
# NOTE: complaint_model.pkl was written with pickle.dump(ComplaintClassifier(...))
# earlier in this script, so `nlp` here is the ComplaintClassifier wrapper,
# NOT a raw spaCy pipeline — it exposes .predict(text) and the underlying
# pipeline as .nlp.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickle files you produced yourself.
with open("complaint_model.pkl", "rb") as f:
    nlp = pickle.load(f)
325
+
326
# ========== Priority detection ==========
def detect_priority(text: str) -> str:
    """Map a complaint to "High", "Medium", or "Low" priority.

    Case-insensitive keyword scan: urgent keywords take precedence over
    medium ones; anything else is "Low".
    """
    lowered = text.lower()
    high_markers = ("urgent", "dangerous", "immediately", "accident", "severe")
    medium_markers = ("not working", "overflow", "leak", "delay", "low pressure")

    if any(marker in lowered for marker in high_markers):
        return "High"
    if any(marker in lowered for marker in medium_markers):
        return "Medium"
    return "Low"
337
+
338
# ========== FastAPI ==========
app = FastAPI()

class ComplaintInput(BaseModel):
    # Raw complaint text to classify.
    text: str

@app.post("/predict")
async def predict_complaint(input_data: ComplaintInput):
    """Classify a complaint and attach a keyword-based priority.

    Returns the complaint text, the top-scoring category, the detected
    priority, and the raw per-category scores.
    """
    # BUG FIX: `nlp` holds the unpickled ComplaintClassifier wrapper (see the
    # pickle.dump earlier in this script), which is NOT callable — calling
    # nlp(...) raised TypeError on every request. Run the underlying spaCy
    # pipeline stored on its `.nlp` attribute to get the per-category scores.
    doc = nlp.nlp(input_data.text)
    cats = doc.cats
    category = max(cats, key=cats.get)
    priority = detect_priority(input_data.text)

    return {
        "Complaint": input_data.text,
        "Predicted Category": category,
        "Priority": priority,
        "Raw Scores": cats
    }
357
+
358
# ========== Run in Colab only ==========
if __name__ == "__main__":
    try:
        # nest_asyncio allows uvicorn's event loop to start inside an
        # already-running loop (e.g. a Colab/Jupyter kernel).
        nest_asyncio.apply()
        uvicorn.run(app, host="0.0.0.0", port=7860)
    except RuntimeError:
        # In Hugging Face or when uvicorn is auto-run, we skip this
        pass
366
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ scikit-learn
4
+ pandas
5
+ numpy
6
+ matplotlib
7
+ spacy
8
+ textblob
+ nest_asyncio