limitedonly41 committed on
Commit
07440db
·
verified ·
1 Parent(s): 0dea594

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -66
app.py CHANGED
@@ -30,95 +30,112 @@ class WebsiteCategorizerApp:
30
  logger.error(f"Ошибка конвертации URL: {e}")
31
  return ""
32
 
33
- # def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
34
- # try:
35
- # if not sheet_url:
36
- # return "❌ Ошибка: Введите URL Google таблицы", ""
 
 
 
 
 
 
 
 
 
37
 
38
- # csv_url = self.convert_google_sheet_url(sheet_url)
39
- # if not csv_url:
40
- # return "❌ Ошибка: Неверный формат URL", ""
41
 
42
- # df = pd.read_csv(csv_url)
43
- # if df.empty:
44
- # return "❌ Ошибка: Таблица пуста", ""
45
 
46
- # if len(df.columns) < 2:
47
- # return "❌ Ошибка: Нужно минимум 2 столбца (URL и категория)", ""
 
48
 
49
- # self.sheet_data = []
50
- # self.results_data = []
51
 
52
- # url_column = df.columns[0]
53
- # category_column = df.columns[1]
 
 
54
 
55
- # for index, row in df.iterrows():
56
- # url = str(row[url_column]).strip() if pd.notna(row[url_column]) else ""
57
- # category = str(row[category_column]).strip() if pd.notna(row[category_column]) else ""
 
 
 
58
 
59
- # if url and url.lower() not in ['url', 'nan']:
60
- # self.sheet_data.append({
61
- # "index": index,
62
- # "url": url,
63
- # "category": category if category.lower() != 'nan' else ""
64
- # })
65
- # self.results_data.append({
66
- # "url": url,
67
- # "category": category if category.lower() != 'nan' else ""
68
- # })
69
 
70
- # if not self.sheet_data:
71
- # return "❌ Ошибка: Не найдены валидные URL", ""
 
 
 
 
 
 
 
 
 
72
 
73
- # self.current_index = 0
74
- # self.sheet_url = sheet_url
75
 
76
- # return f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей", self.get_current_url_for_display()
 
77
 
78
- # except Exception as e:
79
- # logger.error(f"Ошибка подключения к таблице: {e}")
80
- # return f"❌ Ошибка: {str(e)}\n\nУбедитесь что таблица публичная и URL корректный", ""
81
 
82
  def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
83
  try:
84
  if not sheet_url:
85
  return "❌ Ошибка: Введите URL Google таблицы", ""
86
-
87
  csv_url = self.convert_google_sheet_url(sheet_url)
88
  if not csv_url:
89
  return "❌ Ошибка: Неверный формат URL", ""
90
-
91
  df = pd.read_csv(csv_url)
92
  if df.empty or df.shape[1] < 1:
93
  return "❌ Ошибка: Таблица пуста или нет данных", ""
94
-
95
  # Always use column A for URL/title
96
  url_column = df.columns[0]
97
-
98
  # Find category column (case-insensitive match for 'category')
99
  category_col_candidates = [c for c in df.columns if str(c).strip().lower() == "category"]
100
  category_column = category_col_candidates[0] if category_col_candidates else None
101
-
102
  self.sheet_data = []
103
  self.results_data = []
104
-
105
  for index, row in df.iterrows():
106
  raw_value = str(row[url_column]).strip() if pd.notna(row[url_column]) else ""
107
  if not raw_value:
108
  continue
109
-
110
  # Detect if it's URL or title
111
  if raw_value.lower().startswith("http"):
112
  url = raw_value
113
  else:
114
  # treat as title → skip until we find an actual URL? (optional)
115
  url = ""
116
-
117
  # Category (if exists)
118
  category = ""
119
  if category_column and pd.notna(row[category_column]):
120
  category = str(row[category_column]).strip()
121
-
122
  # Only add if URL is valid
123
  if url:
124
  self.sheet_data.append({
@@ -130,19 +147,18 @@ class WebsiteCategorizerApp:
130
  "url": url,
131
  "category": category
132
  })
133
-
134
  if not self.sheet_data:
135
  return "❌ Ошибка: Не найдены валидные URL", ""
136
-
137
  self.current_index = 0
138
  self.sheet_url = sheet_url
139
  return f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей", self.get_current_url_for_display()
140
-
141
  except Exception as e:
142
  logger.error(f"Ошибка подключения к таблице: {e}")
143
  return f"❌ Ошибка: {str(e)}", ""
144
 
145
-
146
  def get_current_url_for_display(self) -> str:
147
  if not self.sheet_data or self.current_index >= len(self.sheet_data):
148
  return ""
@@ -203,13 +219,36 @@ app = WebsiteCategorizerApp()
203
 
204
  with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.Soft()) as demo:
205
  gr.HTML("<h2 style='text-align:center;'>🌐 Категоризатор сайтов</h2>")
 
206
  with gr.Tabs():
207
  with gr.TabItem("Категоризация"):
208
  with gr.Row():
209
  with gr.Column(scale=1):
210
- sheet_url_input = gr.Textbox(label="URL Google таблицы", lines=2)
211
- connect_btn = gr.Button("🔗 Подключить", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  connection_status = gr.HTML("")
 
 
213
  with gr.Row():
214
  prev_btn = gr.Button("⬅️", elem_id="prev-btn")
215
  next_btn = gr.Button("➡️", elem_id="next-btn")
@@ -219,10 +258,11 @@ with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.
219
  save_status = gr.HTML("")
220
  export_btn = gr.Button("📥 Скачать CSV")
221
  export_file = gr.File(visible=False)
 
222
  with gr.Column(scale=5):
223
  website_viewer = gr.HTML("""
224
  <div style='height:900px;display:flex;align-items:center;justify-content:center;background:#eee;border-radius:8px;'>
225
- <p>Подключите Google таблицу</p>
226
  </div>
227
  """)
228
 
@@ -237,27 +277,55 @@ with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.
237
 
238
  csv_data = gr.State("")
239
 
240
- # def handle_connect(url):
241
- # status, iframe_url = app.connect_to_sheet(url)
242
- # if "✅" in status:
243
- # url_display, category, info = app.get_current_info()
244
- # iframe_html = f'<iframe src="{iframe_url}" width="100%" height="900px" style="border-radius:8px;"></iframe>'
245
- # return status, iframe_html, url_display, category, info
246
- # else:
247
- # return status, website_viewer.value, "", "", ""
248
 
249
  def handle_connect(url):
250
  status, iframe_url = app.connect_to_sheet(url)
251
  if "✅" in status:
252
  url_display, category, info = app.get_current_info()
253
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  # dynamically merge categories from data
255
  all_categories = list(set(app.categories + [
256
  c for c in (row["category"] for row in app.results_data) if c
257
  ]))
258
-
259
  iframe_html = f'<iframe src="{iframe_url}" width="100%" height="900px" style="border-radius:8px;"></iframe>'
260
-
261
  return (
262
  status,
263
  iframe_html,
@@ -274,7 +342,6 @@ with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.
274
  ""
275
  )
276
 
277
-
278
  def handle_navigation(direction):
279
  if direction == "next":
280
  url_display, category, info, iframe_url = app.next_record()
@@ -298,12 +365,25 @@ with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.
298
  def refresh_table():
299
  return pd.DataFrame(app.results_data)
300
 
 
 
 
 
 
 
 
301
  connect_btn.click(
302
  handle_connect,
303
  inputs=[sheet_url_input],
304
  outputs=[connection_status, website_viewer, current_url_display, category_dropdown, record_info]
305
  )
306
 
 
 
 
 
 
 
307
  next_btn.click(lambda: handle_navigation("next"),
308
  outputs=[website_viewer, current_url_display, category_dropdown, record_info])
309
  prev_btn.click(lambda: handle_navigation("previous"),
@@ -329,4 +409,4 @@ with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.
329
  """)
330
 
331
  if __name__ == "__main__":
332
- demo.launch()
 
30
  logger.error(f"Ошибка конвертации URL: {e}")
31
  return ""
32
 
33
def load_file_data(self, file_path: str) -> Tuple[str, str]:
    """Load URL/category records from an uploaded CSV or Excel file.

    The first column is scanned for values starting with "http"; every other
    value (empty cells, titles) is skipped. A column whose header equals
    "category" (case-insensitive, surrounding whitespace ignored) supplies
    the category when present; otherwise the category is an empty string.

    Args:
        file_path: Path of the file to read. ``.csv`` is read as UTF-8 via
            ``pd.read_csv``; ``.xlsx``/``.xls`` via ``pd.read_excel``.

    Returns:
        A ``(status_message, current_url)`` tuple. On any failure the status
        starts with "❌" and the URL is an empty string.
    """
    try:
        if not file_path:
            return "❌ Ошибка: Файл не выбран", ""

        # Pick the reader from the (case-insensitive) file extension.
        lowered_path = file_path.lower()
        if lowered_path.endswith('.csv'):
            df = pd.read_csv(file_path, encoding='utf-8')
        elif lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            return "❌ Ошибка: Поддерживаются только CSV и Excel файлы", ""

        if df.empty or df.shape[1] < 1:
            return "❌ Ошибка: Файл пуст или нет данных", ""

        # Column A always carries the URL (or a title that will be skipped).
        url_column = df.columns[0]
        category_column = next(
            (c for c in df.columns if str(c).strip().lower() == "category"),
            None,
        )

        # Collect into locals first so an exception in the middle of parsing
        # cannot leave the instance with half-populated lists.
        sheet_rows = []
        result_rows = []
        for index, row in df.iterrows():
            cell = row[url_column]
            raw_value = str(cell).strip() if pd.notna(cell) else ""
            if not raw_value.lower().startswith("http"):
                # Empty cell or a non-URL title: nothing to categorize.
                continue

            category = ""
            if category_column is not None and pd.notna(row[category_column]):
                category = str(row[category_column]).strip()

            sheet_rows.append({
                "index": index,
                "url": raw_value,
                "category": category,
            })
            result_rows.append({
                "url": raw_value,
                "category": category,
            })

        # Commit in one step; an empty result still replaces previous data.
        self.sheet_data = sheet_rows
        self.results_data = result_rows

        if not sheet_rows:
            return "❌ Ошибка: Не найдены валидные URL", ""

        self.current_index = 0
        return (
            f"✅ Файл загружен успешно! Найдено {len(self.sheet_data)} записей",
            self.get_current_url_for_display(),
        )

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Ошибка загрузки файла: {e}")
        return f"❌ Ошибка: {str(e)}", ""
98
 
99
  def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
100
  try:
101
  if not sheet_url:
102
  return "❌ Ошибка: Введите URL Google таблицы", ""
103
+
104
  csv_url = self.convert_google_sheet_url(sheet_url)
105
  if not csv_url:
106
  return "❌ Ошибка: Неверный формат URL", ""
107
+
108
  df = pd.read_csv(csv_url)
109
  if df.empty or df.shape[1] < 1:
110
  return "❌ Ошибка: Таблица пуста или нет данных", ""
111
+
112
  # Always use column A for URL/title
113
  url_column = df.columns[0]
114
+
115
  # Find category column (case-insensitive match for 'category')
116
  category_col_candidates = [c for c in df.columns if str(c).strip().lower() == "category"]
117
  category_column = category_col_candidates[0] if category_col_candidates else None
118
+
119
  self.sheet_data = []
120
  self.results_data = []
121
+
122
  for index, row in df.iterrows():
123
  raw_value = str(row[url_column]).strip() if pd.notna(row[url_column]) else ""
124
  if not raw_value:
125
  continue
126
+
127
  # Detect if it's URL or title
128
  if raw_value.lower().startswith("http"):
129
  url = raw_value
130
  else:
131
  # treat as title → skip until we find an actual URL? (optional)
132
  url = ""
133
+
134
  # Category (if exists)
135
  category = ""
136
  if category_column and pd.notna(row[category_column]):
137
  category = str(row[category_column]).strip()
138
+
139
  # Only add if URL is valid
140
  if url:
141
  self.sheet_data.append({
 
147
  "url": url,
148
  "category": category
149
  })
150
+
151
  if not self.sheet_data:
152
  return "❌ Ошибка: Не найдены валидные URL", ""
153
+
154
  self.current_index = 0
155
  self.sheet_url = sheet_url
156
  return f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей", self.get_current_url_for_display()
157
+
158
  except Exception as e:
159
  logger.error(f"Ошибка подключения к таблице: {e}")
160
  return f"❌ Ошибка: {str(e)}", ""
161
 
 
162
  def get_current_url_for_display(self) -> str:
163
  if not self.sheet_data or self.current_index >= len(self.sheet_data):
164
  return ""
 
219
 
220
  with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.Soft()) as demo:
221
  gr.HTML("<h2 style='text-align:center;'>🌐 Категоризатор сайтов</h2>")
222
+
223
  with gr.Tabs():
224
  with gr.TabItem("Категоризация"):
225
  with gr.Row():
226
  with gr.Column(scale=1):
227
+ # Data source selection
228
+ gr.Markdown("### 📊 Источник данных")
229
+ data_source = gr.Radio(
230
+ choices=["Google Sheets", "Файл CSV/Excel"],
231
+ value="Google Sheets",
232
+ label="Выберите источник данных"
233
+ )
234
+
235
+ # Google Sheets section
236
+ with gr.Group(visible=True) as google_sheets_group:
237
+ sheet_url_input = gr.Textbox(label="URL Google таблицы", lines=2)
238
+ connect_btn = gr.Button("🔗 Подключить", variant="primary")
239
+
240
+ # File upload section
241
+ with gr.Group(visible=False) as file_upload_group:
242
+ file_input = gr.File(
243
+ label="Загрузить файл CSV или Excel",
244
+ file_types=[".csv", ".xlsx", ".xls"],
245
+ file_count="single"
246
+ )
247
+ load_file_btn = gr.Button("📁 Загрузить файл", variant="primary")
248
+
249
  connection_status = gr.HTML("")
250
+
251
+ # Navigation controls
252
  with gr.Row():
253
  prev_btn = gr.Button("⬅️", elem_id="prev-btn")
254
  next_btn = gr.Button("➡️", elem_id="next-btn")
 
258
  save_status = gr.HTML("")
259
  export_btn = gr.Button("📥 Скачать CSV")
260
  export_file = gr.File(visible=False)
261
+
262
  with gr.Column(scale=5):
263
  website_viewer = gr.HTML("""
264
  <div style='height:900px;display:flex;align-items:center;justify-content:center;background:#eee;border-radius:8px;'>
265
+ <p>Подключите источник данных</p>
266
  </div>
267
  """)
268
 
 
277
 
278
  csv_data = gr.State("")
279
 
280
def toggle_data_source(source):
    """Return visibility updates for the two data-source groups.

    The first update targets the Google Sheets group, the second the file
    upload group; exactly one of them is made visible.
    """
    show_sheets = source == "Google Sheets"
    return gr.update(visible=show_sheets), gr.update(visible=not show_sheets)
 
 
 
285
 
286
def handle_connect(url):
    """Connect to the Google Sheet at *url* and build the refreshed UI values.

    Returns a 5-tuple in the order of the outputs wired to the connect
    button: connection status, viewer HTML, current URL text, category
    dropdown update, record info.
    """
    import html  # local import: only needed to escape the iframe URL

    status, iframe_url = app.connect_to_sheet(url)
    if "✅" not in status:
        # Connection failed: keep the viewer placeholder and clear the rest.
        return (
            status,
            website_viewer.value,
            "",
            gr.update(choices=app.categories, value=None),
            ""
        )

    url_display, category, info = app.get_current_info()

    # Merge predefined categories with those found in the data.
    # dict.fromkeys de-duplicates while keeping a stable order (predefined
    # first, then first appearance in the data) instead of arbitrary set order.
    data_categories = [row["category"] for row in app.results_data if row["category"]]
    all_categories = list(dict.fromkeys(app.categories + data_categories))

    # The URL comes from user-supplied sheet content, so escape it before
    # placing it inside the src attribute to avoid breaking out of the markup.
    safe_src = html.escape(iframe_url, quote=True)
    iframe_html = f'<iframe src="{safe_src}" width="100%" height="900px" style="border-radius:8px;"></iframe>'

    return (
        status,
        iframe_html,
        url_display,
        gr.update(choices=all_categories, value=category),
        info
    )
313
+
314
+ def handle_file_upload(file):
315
+ if file is None:
316
+ return "❌ Ошибка: Файл не выбран", "", "", gr.update(choices=app.categories, value=None), ""
317
+
318
+ status, iframe_url = app.load_file_data(file.name)
319
+ if "✅" in status:
320
+ url_display, category, info = app.get_current_info()
321
+
322
  # dynamically merge categories from data
323
  all_categories = list(set(app.categories + [
324
  c for c in (row["category"] for row in app.results_data) if c
325
  ]))
326
+
327
  iframe_html = f'<iframe src="{iframe_url}" width="100%" height="900px" style="border-radius:8px;"></iframe>'
328
+
329
  return (
330
  status,
331
  iframe_html,
 
342
  ""
343
  )
344
 
 
345
  def handle_navigation(direction):
346
  if direction == "next":
347
  url_display, category, info, iframe_url = app.next_record()
 
365
def refresh_table():
    """Build a DataFrame from the app's current results list."""
    records = app.results_data
    return pd.DataFrame(records)
367
 
368
+ # Event handlers
369
+ data_source.change(
370
+ toggle_data_source,
371
+ inputs=[data_source],
372
+ outputs=[google_sheets_group, file_upload_group]
373
+ )
374
+
375
  connect_btn.click(
376
  handle_connect,
377
  inputs=[sheet_url_input],
378
  outputs=[connection_status, website_viewer, current_url_display, category_dropdown, record_info]
379
  )
380
 
381
+ load_file_btn.click(
382
+ handle_file_upload,
383
+ inputs=[file_input],
384
+ outputs=[connection_status, website_viewer, current_url_display, category_dropdown, record_info]
385
+ )
386
+
387
  next_btn.click(lambda: handle_navigation("next"),
388
  outputs=[website_viewer, current_url_display, category_dropdown, record_info])
389
  prev_btn.click(lambda: handle_navigation("previous"),
 
409
  """)
410
 
411
  if __name__ == "__main__":
412
+ demo.launch()