afouda committed on
Commit
3330321
·
verified ·
1 Parent(s): 246d855

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +404 -0
app.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from __future__ import annotations
4
+ import os
5
+ import re
6
+ import typing as T
7
+ import numpy as np
8
+ import pandas as pd
9
+ from dataclasses import dataclass
10
+
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.cluster import KMeans
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
+ import gradio as gr
16
+
17
+
18
# Unified discipline labels that every project is mapped onto.
# The position of each label matters: train_kmeans builds one canonical TF-IDF
# row per entry in this order, and infer_discipline indexes back into this list
# with the row number of the most similar canonical vector.
CANONICAL_DISCIPLINES = [
    "Computer Engineering",
    "Computer Science",
    "Software Engineering",
    "Information Systems",
    "Data Science",
    "Artificial Intelligence",
    "Electrical Engineering",
    "Electronics Engineering",
    "Communication Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Biomedical Engineering",
    "Mechatronics",
    "Chemical Engineering",
    "Industrial Engineering",
    "Architecture",
    "Business Administration",
    "Accounting",
    "Marketing",
    "Finance",
    "Economics",
]
41
+
42
# Keyword rules for direct mapping (Arabic + English). Order matters.
# Each entry is (regex pattern, canonical label). rule_based_map tries the
# patterns top to bottom with re.IGNORECASE against text that has already been
# lowercased, stripped of punctuation and stopword-filtered by _normalize_text,
# and returns the label of the first pattern that matches.
RULES: list[tuple[str, str]] = [
    # AI / Data / CS
    (r"\b(data\s*science|تحليل\s*البيانات|علم\s*البيانات)\b", "Data Science"),
    (r"\b(artificial\s*intelligence|ذكاء\s*اصطناعي|ذكاء\s*إصطناعي|AI)\b", "Artificial Intelligence"),
    (r"\b(software\s*engineering|هندسة\s*البرمجيات)\b", "Software Engineering"),
    (r"\b(information\s*systems|نظم\s*المعلومات)\b", "Information Systems"),
    (r"\b(computer\s*science|علوم?\s*الحاسوب|حاسبات|CS)\b", "Computer Science"),
    (r"\b(computer\s*engineering|هندسة\s*الحاسبات|كمبيوتر)\b", "Computer Engineering"),
    # EE / Comm / Electronics
    (r"\b(communications?\s*engineering|اتصالات)\b", "Communication Engineering"),
    (r"\b(electrical\s*engineering|كهرب(اء|ائية))\b", "Electrical Engineering"),
    (r"\b(electronics?\s*engineering|إلكترونيات)\b", "Electronics Engineering"),
    # Other engineering
    (r"\b(mechatronics?|ميكاترونكس)\b", "Mechatronics"),
    (r"\b(mechanical\s*engineering|ميكانيكا)\b", "Mechanical Engineering"),
    (r"\b(civil\s*engineering|مدني)\b", "Civil Engineering"),
    (r"\b(biomedical\s*engineering|هندسة\s*طبية)\b", "Biomedical Engineering"),
    (r"\b(chemical\s*engineering|كيميائية)\b", "Chemical Engineering"),
    (r"\b(industrial\s*engineering|انتاج|صناعية)\b", "Industrial Engineering"),
    (r"\b(architecture|هندسة\s*معمارية|عمارة)\b", "Architecture"),
    # Business
    (r"\b(business\s*administration|ادارة\s*اعمال)\b", "Business Administration"),
    (r"\b(accounting|محاسبة)\b", "Accounting"),
    (r"\b(marketing|تسويق)\b", "Marketing"),
    (r"\b(finance|تمويل)\b", "Finance"),
    (r"\b(economics|اقتصاد)\b", "Economics"),
]
70
+
71
# Arabic tokens dropped by _normalize_text before rule matching and
# vectorization.
STOPWORDS_AR = {
    "جامعة", "كلية", "قسم", "تخصص", "مشروع", "مشاريع", "عن", "في", "من", "على", "و",
}
74
+
75
# English tokens dropped by _normalize_text.
# NOTE: _normalize_text replaces every run of non-word characters (including
# apostrophes) with a space before filtering, so the contracted entries such as
# "aren't" can never equal a token; their split halves ("aren", "t") are what
# actually get filtered.
STOPWORDS_EN = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and',
    'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'could',
    'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn',
    "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for',
    'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have',
    'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
    'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it',
    "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't",
    'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'now',
    'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she',
    "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such',
    't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were',
    'weren', "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you',
    "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
}
96
+
97
+ ## -------------------
98
+ ## Data Structures
99
+ ## -------------------
100
+
101
@dataclass
class Models:
    """Fitted artefacts produced by ``train_kmeans`` and read by ``infer_discipline``."""

    # TF-IDF vectorizer fitted on the combined department/description/keywords text.
    vectorizer: TfidfVectorizer
    # KMeans model fitted on the TF-IDF matrix of the dataset.
    kmeans: KMeans
    # TF-IDF vectors for canonical labels, one row per CANONICAL_DISCIPLINES entry.
    # NOTE(review): train_kmeans stores the result of vectorizer.transform(...)
    # here, which is presumably a SciPy sparse matrix rather than an np.ndarray
    # -- confirm and adjust the annotation if so.
    canonical_matrix: np.ndarray  # TF-IDF vectors for canonical labels
106
+
107
@dataclass
class AppState:
    """Everything the UI callbacks need after a dataset has been loaded."""

    # Project table including the inferred "discipline" column.
    df: pd.DataFrame
    # Fitted vectorizer / KMeans / canonical matrix used to classify new text.
    models: Models
    # University name -> sorted list of its department names (feeds the dropdowns).
    dep_dict: dict[str, list[str]]
112
+
113
+
114
+ def _normalize_text(s: str) -> str:
115
+ if not isinstance(s, str):
116
+ return ""
117
+ s = s.strip().lower()
118
+ s = re.sub(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]", "", s) # remove Arabic diacritics
119
+ s = re.sub(r"[\W_]+", " ", s)
120
+ words = s.split()
121
+ # Filter out stopwords from both Arabic and English sets
122
+ filtered_words = [word for word in words if word not in STOPWORDS_AR and word not in STOPWORDS_EN]
123
+ return " ".join(filtered_words)
124
+
125
def rule_based_map(text: str) -> str | None:
    """Return the label of the first ``RULES`` pattern found in *text*, else ``None``.

    The text is normalized with ``_normalize_text`` first; patterns are tried
    in list order and matched case-insensitively.
    """
    normalized = _normalize_text(text)
    hits = (
        label
        for pattern, label in RULES
        if re.search(pattern, normalized, flags=re.IGNORECASE)
    )
    return next(hits, None)
131
+
132
def build_department_dict(df: pd.DataFrame) -> dict[str, list[str]]:
    """Map each university to the sorted list of its distinct department names.

    Args:
        df: Project table with at least ``university`` and ``department``
            columns.

    Returns:
        ``{university: [department, ...]}`` with departments stripped of
        surrounding whitespace, de-duplicated and sorted case-insensitively.
        Missing or blank departments are omitted; a university whose rows all
        lack a department maps to an empty list.
    """
    mapping: dict[str, list[str]] = {}
    for uni, group in df.groupby("university"):
        # Drop missing values BEFORE converting to str: astype(str) would turn
        # NaN/None into the literal strings "nan"/"None", which would then be
        # offered as department choices.
        cleaned = group["department"].dropna().astype(str).str.strip()
        distinct = {dep for dep in cleaned if dep}
        # Secondary key keeps the order deterministic when two names differ
        # only by letter case.
        mapping[str(uni)] = sorted(distinct, key=lambda dep: (dep.lower(), dep))
    return mapping
145
+
146
def train_kmeans(df: pd.DataFrame, n_clusters: int | None = None) -> Models:
    """Fit the TF-IDF vectorizer and KMeans model on the project texts.

    Args:
        df: Project table with ``department``, ``description`` and
            ``keywords`` columns.
        n_clusters: Number of clusters. When ``None`` it is derived from the
            number of distinct departments, capped at the number of canonical
            disciplines. In every case it is clamped to the number of rows,
            because KMeans cannot fit more clusters than samples.

    Returns:
        A ``Models`` bundle holding the fitted vectorizer, the fitted KMeans
        model and the TF-IDF matrix of the canonical discipline labels.

    Raises:
        ValueError: If *df* has no rows, or if the vectorizer ends up with an
            empty vocabulary.
    """
    if df.empty:
        raise ValueError("Cannot train on an empty dataset.")

    # Blank out missing cells BEFORE str conversion; astype(str) alone would
    # inject the literal token "nan" into the training text.
    fields = df[["department", "description", "keywords"]]
    fields = fields.where(fields.notna(), "").astype(str)
    # Use combined text to better infer discipline
    combo = (
        fields["department"] + " " + fields["description"] + " " + fields["keywords"]
    ).apply(_normalize_text)

    n_samples = len(combo)
    if n_clusters is None:
        n_clusters = min(len(CANONICAL_DISCIPLINES), max(2, df["department"].nunique()))
    # KMeans raises when n_clusters > n_samples (tiny uploads).
    n_clusters = max(1, min(int(n_clusters), n_samples))

    # With a single document max_df=0.9 corresponds to fewer documents than
    # min_df=1 and the vectorizer refuses to fit, so relax it in that case.
    max_df = 0.9 if n_samples > 1 else 1.0
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=max_df)
    X = vectorizer.fit_transform(combo)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(X)

    # Build canonical label matrix to map clusters to closest discipline later
    canonical_texts = [
        _normalize_text(lbl) + " " + lbl.replace("Engineering", " Eng ")
        for lbl in CANONICAL_DISCIPLINES
    ]
    canonical_matrix = vectorizer.transform(canonical_texts)

    return Models(vectorizer=vectorizer, kmeans=kmeans, canonical_matrix=canonical_matrix)
172
+
173
def infer_discipline(text_fields: list[str], models: Models) -> str:
    """Map free-text project fields onto one canonical discipline.

    Resolution order:
      1. The first field that matches a keyword rule decides the label.
      2. Otherwise the merged text is vectorized and compared with the
         canonical label vectors by cosine similarity.
      3. If the text shares no vocabulary with any label, the centroid of the
         KMeans cluster the text falls into is compared instead.
      4. If that also yields no similarity, ``"Unknown"`` is returned.

    Args:
        text_fields: Text snippets in priority order; non-string entries are
            ignored.
        models: Fitted vectorizer, KMeans model and canonical label matrix.

    Returns:
        A label from ``CANONICAL_DISCIPLINES`` or ``"Unknown"``.
    """
    # Try rules first
    for field in text_fields:
        label = rule_based_map(field)
        if label:
            return label

    merged = _normalize_text(" ".join(t for t in text_fields if isinstance(t, str)))
    if not merged.strip():
        return "Unknown"

    vec = models.vectorizer.transform([merged])
    sims = cosine_similarity(vec, models.canonical_matrix)[0]

    if sims.max() <= 0:
        # No direct overlap with any label: argmax over an all-zero row would
        # silently pick index 0. Fall back to the cluster the text belongs to
        # and compare that cluster's centroid with the canonical labels.
        cluster_idx = int(models.kmeans.predict(vec)[0])
        centroid = models.kmeans.cluster_centers_[cluster_idx].reshape(1, -1)
        sims = cosine_similarity(centroid, models.canonical_matrix)[0]
        if sims.max() <= 0:
            return "Unknown"

    return CANONICAL_DISCIPLINES[int(np.argmax(sims))]
191
+
192
def add_discipline_column(df: pd.DataFrame, models: Models) -> pd.DataFrame:
    """Return a copy of *df* with an inferred ``discipline`` column appended.

    Args:
        df: Project table with ``department``, ``description`` and
            ``keywords`` columns.
        models: Fitted models passed through to ``infer_discipline``.

    Returns:
        A new DataFrame; *df* itself is left untouched.
    """
    fields = df[["department", "description", "keywords"]]
    # Replace missing cells with "" BEFORE str conversion: astype(str) first
    # would turn NaN into the word "nan" and feed it to the classifier.
    texts = fields.where(fields.notna(), "").astype(str).values.tolist()

    out = df.copy()
    out["discipline"] = [infer_discipline(row, models) for row in texts]
    return out
204
+
205
def load_dataset(csv_file_path: str | None) -> pd.DataFrame:
    """Read the projects CSV and return it with normalized, cleaned columns.

    Column names are stripped and lowercased so headers such as ``" Title "``
    are accepted. The required columns are converted to stripped strings, with
    missing cells becoming empty strings.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If the path is empty or does not exist.
        ValueError: If any required column is absent.
    """
    if not csv_file_path or not os.path.exists(csv_file_path):
        raise FileNotFoundError("CSV file not found. Please upload or set a valid path.")

    df = pd.read_csv(csv_file_path)

    # Check for expected columns, be flexible with case/spacing
    required = ["title", "description", "keywords", "university", "faculty", "department"]
    df.columns = [str(c).strip().lower() for c in df.columns]  # Normalize column names

    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"CSV missing required columns: {missing}")

    # Clean data. fillna must run BEFORE astype(str); the other order turns
    # every empty cell into the literal string "nan".
    for c in required:
        df[c] = df[c].fillna("").astype(str).str.strip()
    return df
223
+
224
# Initialize from a default path if provided via env
# (PROJECTS_CSV_PATH; falls back to "projects_100.csv" when unset).
DEFAULT_CSV = os.getenv("PROJECTS_CSV_PATH", "projects_100.csv")
# Module-wide application state shared by all UI callbacks; stays None until a
# dataset has been loaded by refresh_data or by the __main__ preload.
_state: AppState | None = None
227
+
228
def _init_state(csv_path: str) -> AppState:
    """Load *csv_path*, fit the models and return the resulting ``AppState``.

    Any exception raised while loading or training propagates to the caller.
    """
    raw = load_dataset(csv_path)
    fitted = train_kmeans(raw)
    labelled = add_discipline_column(raw, fitted)
    return AppState(
        df=labelled,
        models=fitted,
        dep_dict=build_department_dict(labelled),
    )
234
+
235
def refresh_data(csv_file_obj):
    """(Re)load CSV and rebuild models + dropdowns.

    Args:
        csv_file_obj: Value delivered by the ``gr.File`` component: either an
            object exposing the temporary file path as ``.name`` or the path
            itself; ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(status message, university dropdown, department dropdown,
        preview dataset)`` matching the outputs wired in ``build_app``. On
        failure the message describes the error and the widgets are emptied;
        the previously loaded state (if any) is left in place.
    """
    global _state

    def _failure(message):
        # Same "empty widgets" payload for every failure path.
        return (
            message,
            gr.Dropdown(choices=[]),
            gr.Dropdown(choices=[]),
            gr.Dataset(headers=[], samples=[]),
        )

    if csv_file_obj is None:
        return _failure("Please upload a file.")

    try:
        # Accept both file-like uploads (path in .name) and plain path strings.
        csv_path = getattr(csv_file_obj, "name", csv_file_obj)
        state = _init_state(csv_path)
    except Exception as e:  # UI boundary: report the problem instead of crashing
        return _failure(f"Error: {e}")

    # Only replace the shared state once the new one is fully built.
    _state = state

    universities = sorted(state.dep_dict.keys())
    first_uni = universities[0] if universities else None

    deps = state.dep_dict.get(first_uni, []) if first_uni else []
    first_dep = deps[0] if deps else None

    # Example preview dataset (first 5 rows)
    preview = state.df[["title", "university", "faculty", "department", "discipline"]].head(5)

    return (
        f"Loaded {len(state.df)} projects.",
        gr.Dropdown(choices=universities, value=first_uni),
        gr.Dropdown(choices=deps, value=first_dep),
        gr.Dataset(samples=preview.values.tolist(), headers=list(preview.columns)),
    )
263
+
264
def update_departments(university: str):
    """Return a department dropdown populated for the selected *university*.

    The dropdown is empty when no dataset is loaded or no university is
    selected; otherwise the first department is preselected if there is one.
    """
    if _state and university:
        options = _state.dep_dict.get(university, [])
        selected = options[0] if options else None
        return gr.Dropdown(choices=options, value=selected)
    return gr.Dropdown(choices=[], value=None)
269
+
270
def query_projects(university: str, department: str):
    """List projects sharing the unified discipline of the chosen department.

    Returns a 3-tuple ``(markdown message, same-university projects,
    other-university projects)``. The discipline is taken from the first
    loaded row matching the university/department pair (case-insensitive), or
    inferred from the department name when no such row exists.
    """
    if not _state:
        return "Please load a file first.", pd.DataFrame(), pd.DataFrame()

    if not (university and department):
        return "Please select a university and department.", pd.DataFrame(), pd.DataFrame()

    projects = _state.df
    uni_key = str(university).lower()
    dep_key = str(department).lower()
    uni_lower = projects["university"].str.lower()

    # Determine the discipline of the chosen department
    matches = projects[(uni_lower == uni_key) & (projects["department"].str.lower() == dep_key)]
    if matches.empty:
        discipline = infer_discipline([department], _state.models)
    else:
        discipline = matches.iloc[0]["discipline"]

    has_discipline = projects["discipline"] == discipline
    # Same university, same discipline
    same_uni = projects[(uni_lower == uni_key) & has_discipline]
    # Other universities, same discipline
    other_unis = projects[(uni_lower != uni_key) & has_discipline]

    msg = f"Unified Discipline: **{discipline}**\n\nProjects from the same university: {len(same_uni)} | From other universities: {len(other_unis)}"

    cols = ["title", "description", "keywords", "university", "faculty", "department", "discipline"]
    return (
        msg,
        same_uni[cols].reset_index(drop=True),
        other_unis[cols].reset_index(drop=True),
    )
301
+
302
def classify_ad_hoc(university: str, faculty: str, department: str, title: str, description: str, keywords: str):
    """Classify an unsaved project and list loaded projects of that discipline.

    The discipline is inferred from department, description, keywords and
    title, in that priority order; ``faculty`` is accepted but not used.
    Returns a 3-tuple ``(markdown message, same-university projects,
    other-university projects)``.
    """
    if not _state:
        return "Please load a file first.", pd.DataFrame(), pd.DataFrame()

    discipline = infer_discipline([department, description, keywords, title], _state.models)

    projects = _state.df
    uni_key = str(university).lower()
    uni_lower = projects["university"].str.lower()
    has_discipline = projects["discipline"] == discipline

    # Find similar projects based on the inferred discipline
    same_uni = projects[(uni_lower == uni_key) & has_discipline]
    other_unis = projects[(uni_lower != uni_key) & has_discipline]

    info = f"Your project was classified as: **{discipline}**"
    cols = ["title", "description", "keywords", "university", "faculty", "department", "discipline"]
    return (
        info,
        same_uni[cols].reset_index(drop=True),
        other_unis[cols].reset_index(drop=True),
    )
322
+
323
def build_app():
    """Assemble the Gradio Blocks UI and wire its event handlers.

    Layout: an upload row, a status line and a data preview, followed by two
    tabs ("Search by Discipline" and "Classify a New Project").

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    # NOTE(review): the nesting below (which widgets sit inside each gr.Row)
    # was reconstructed from a flattened copy of the file -- confirm it against
    # the intended layout.
    with gr.Blocks(title="University Project Discipline Classifier", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔎 Classify Graduation Projects by **Unified Discipline**
        **Upload a CSV file** with the required columns. After uploading, choose the university and department to view:
        1. Projects from the **same university** with the same unified discipline.
        2. Projects from **other universities** with the same discipline (thanks to clustering).
        """)

        with gr.Row():
            csv_file = gr.File(label="Projects File (CSV)", file_types=[".csv"])
            load_btn = gr.Button("Load / Reload Data")

        status = gr.Markdown("No file loaded yet.")
        preview = gr.Dataset(components=[], headers=[], samples=[], label="Data Preview (first 5 rows)")

        with gr.Tab("Search by Discipline"):
            with gr.Row():
                uni_dd = gr.Dropdown(label="University", choices=[])
                dep_dd = gr.Dropdown(label="Department / Specialization", choices=[])
                search_btn = gr.Button("Search")

            result_msg = gr.Markdown()
            same_uni_tbl = gr.Dataframe(label="Projects from the Same University & Discipline", interactive=False)
            other_unis_tbl = gr.Dataframe(label="Projects from Other Universities (Same Discipline)", interactive=False)

        with gr.Tab("Classify a New Project"):
            gr.Markdown("## Try Classifying a New Project (without saving)")
            with gr.Row():
                ah_uni = gr.Textbox(label="University")
                ah_fac = gr.Textbox(label="Faculty")
                ah_dep = gr.Textbox(label="Department / Specialization")
            ah_title = gr.Textbox(label="Project Title")
            ah_desc = gr.Textbox(label="Description", lines=3)
            ah_keys = gr.Textbox(label="Keywords (comma-separated)", info="e.g., deep learning, Python, IoT")
            classify_btn = gr.Button("Classify Project & Show Similar Projects")
            info_box = gr.Markdown()

        # Loading a file refreshes the status line, both dropdowns and the preview.
        load_btn.click(
            fn=refresh_data,
            inputs=[csv_file],
            outputs=[status, uni_dd, dep_dd, preview]
        )

        # Picking a university repopulates the department dropdown.
        uni_dd.change(
            fn=update_departments,
            inputs=[uni_dd],
            outputs=[dep_dd]
        )

        search_btn.click(
            fn=query_projects,
            inputs=[uni_dd, dep_dd],
            outputs=[result_msg, same_uni_tbl, other_unis_tbl]
        )

        # The ad-hoc classifier writes its matches into the same two result
        # tables that the search handler uses.
        classify_btn.click(
            fn=classify_ad_hoc,
            inputs=[ah_uni, ah_fac, ah_dep, ah_title, ah_desc, ah_keys],
            outputs=[info_box, same_uni_tbl, other_unis_tbl]
        )

    return demo
386
+
387
+
388
if __name__ == "__main__":
    # Best-effort preload of the default dataset; the app still starts, with
    # no data loaded, when the file is absent or cannot be processed.
    try:
        if not os.path.exists(DEFAULT_CSV):
            print(f"Default CSV '{DEFAULT_CSV}' not found. Please upload a file in the app.")
            _state = None
        else:
            print(f"Loading default data from: {DEFAULT_CSV}")
            _state = _init_state(DEFAULT_CSV)
            print("Default data loaded successfully.")
    except Exception as e:
        print(f"Initial load failed: {e}")
        _state = None

    app = build_app()
    # For local dev, set share=True if you want a public link
    listen_port = int(os.getenv("PORT", 7860))
    app.launch(server_name="0.0.0.0", server_port=listen_port)