wuhp committed on
Commit
27f233b
·
verified ·
1 Parent(s): 6fa2207

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -19
app.py CHANGED
@@ -13,6 +13,15 @@ import torch
13
  from torch.utils.data import Dataset, DataLoader
14
  from sklearn.model_selection import train_test_split
15
  import numpy as np
 
 
 
 
 
 
 
 
 
16
 
17
  # --- CONFIGURATION ---
18
  DATASET_DIR = "dataset_ml_final_v2"
@@ -26,20 +35,17 @@ MODEL = None
26
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
27
 
28
  try:
29
- # ---------------------------------------------------------
30
- # FIX APPLIED HERE:
31
- # 1. Removed AdamW from transformers import
32
- # 2. Added AdamW from torch.optim
33
- # ---------------------------------------------------------
34
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
35
  from torch.optim import AdamW
36
 
 
 
 
37
  print("Attempting to load Longformer Tokenizer...")
38
  TOKENIZER = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
39
  print("✅ Tokenizer loaded successfully.")
40
  except Exception as e:
41
  print(f"⚠️ Tokenizer loading error: {e}")
42
- # Fallback for debugging if transformers fails entirely
43
  AdamW = None
44
 
45
  # --- ERAS (10 Distinct Periods) ---
@@ -154,7 +160,7 @@ def download_book(identifier, title, year, era_label, min_char_limit=5000):
154
  content = ""
155
  for url in urls:
156
  try:
157
- r = requests.get(url, timeout=10)
158
  if r.status_code == 200:
159
  content = r.text
160
  break
@@ -241,16 +247,25 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
241
  print(f" > Standard Search #{attempts}: {topic}")
242
 
243
  try:
244
- search = internetarchive.search_items(
245
  query,
246
  sorts=['downloads desc'],
247
  fields=['identifier', 'title', 'date', 'year']
248
  )
249
 
250
- results_found = 0
251
- for res in search:
 
 
 
 
 
 
 
 
 
 
252
  if collected >= books_per_era: break
253
- results_found += 1
254
 
255
  id_ = res.get('identifier')
256
  raw_date = res.get('date') or res.get('year')
@@ -265,6 +280,7 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
265
  if any(r['filename'].endswith(f"{id_}.txt") for r in records):
266
  continue
267
 
 
268
  rec = download_book(id_, res.get('title', 'Unknown'), year, era_label, min_char_limit=min_chars)
269
  if rec:
270
  rec['topic'] = "Classic" if using_rescue else topic
@@ -272,9 +288,6 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
272
  collected += 1
273
  print(f" ✅ Saved ({collected}/{books_per_era}): {rec['title']} ({year})")
274
 
275
- if results_found >= (50 if era_label == "1_Late_Medieval" else (30 if is_hard_era else 10)):
276
- break
277
-
278
  if results_found == 0:
279
  print(f" ⚠️ No results found for this query")
280
 
@@ -284,6 +297,7 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
284
 
285
  print(f"Completed {era_label}: {collected}/{books_per_era} books collected")
286
 
 
287
  if era_label == "1_Late_Medieval" and collected < books_per_era * 0.3:
288
  print(f"\n⚠️ EMERGENCY FALLBACK MODE for {era_label}")
289
  fallback_attempts = 0
@@ -300,10 +314,17 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
300
  print(f" > 🚨 Fallback #{fallback_attempts}: {term}")
301
 
302
  try:
303
- search = internetarchive.search_items(query, sorts=['downloads desc'], fields=['identifier', 'title', 'date', 'year'])
 
 
 
 
 
 
 
304
  checked = 0
305
- for res in search:
306
- if collected >= books_per_era or checked >= 100:
307
  break
308
  checked += 1
309
 
@@ -324,6 +345,8 @@ def generate_dataset(total_books_needed, progress=gr.Progress()):
324
  except Exception as e:
325
  print(f" ❌ Fallback error: {e}")
326
  time.sleep(1)
 
 
327
 
328
  if not records: return None, pd.DataFrame(), pd.DataFrame()
329
 
@@ -457,7 +480,6 @@ def train_model(dataset_path, epochs, batch_size, learning_rate, progress=gr.Pro
457
  )
458
  MODEL.to(DEVICE)
459
 
460
- # FIX: Ensure we use the AdamW imported from torch.optim
461
  optimizer = AdamW(MODEL.parameters(), lr=learning_rate)
462
  total_steps = len(train_loader) * epochs
463
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
 
13
  from torch.utils.data import Dataset, DataLoader
14
  from sklearn.model_selection import train_test_split
15
  import numpy as np
16
+ import nest_asyncio # ⭐️ FIX 2: Added nest_asyncio for stability
17
+ import sys
18
+
19
+ # --- SYSTEM FIXES ---
20
+ # ⭐️ FIX 2: Apply nest_asyncio to prevent EventLoop/Gradio conflicts (Invalid file descriptor: -1)
21
+ try:
22
+ nest_asyncio.apply()
23
+ except Exception as e:
24
+ print(f"Warning: Could not apply nest_asyncio: {e}")
25
 
26
  # --- CONFIGURATION ---
27
  DATASET_DIR = "dataset_ml_final_v2"
 
35
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
36
 
37
  try:
38
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, logging
 
 
 
 
 
39
  from torch.optim import AdamW
40
 
41
+ # Suppress heavy warnings from transformers
42
+ logging.set_verbosity_error()
43
+
44
  print("Attempting to load Longformer Tokenizer...")
45
  TOKENIZER = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
46
  print("✅ Tokenizer loaded successfully.")
47
  except Exception as e:
48
  print(f"⚠️ Tokenizer loading error: {e}")
 
49
  AdamW = None
50
 
51
  # --- ERAS (10 Distinct Periods) ---
 
160
  content = ""
161
  for url in urls:
162
  try:
163
+ r = requests.get(url, timeout=15) # Increased timeout for robustness
164
  if r.status_code == 200:
165
  content = r.text
166
  break
 
247
  print(f" > Standard Search #{attempts}: {topic}")
248
 
249
  try:
250
+ search_generator = internetarchive.search_items(
251
  query,
252
  sorts=['downloads desc'],
253
  fields=['identifier', 'title', 'date', 'year']
254
  )
255
 
256
+ # ⭐️ FIX 1: Pre-fetch a batch of results to close the search connection quickly
257
+ search_results_batch = []
258
+ # Check a reasonable number of items before going back to the search
259
+ max_check_per_query = (50 if era_label == "1_Late_Medieval" else (30 if is_hard_era else 10))
260
+ for i, item in enumerate(search_generator):
261
+ search_results_batch.append(item)
262
+ if i >= max_check_per_query: break
263
+
264
+ results_found = len(search_results_batch)
265
+
266
+ # Now iterate through the SAFE pre-fetched list
267
+ for res in search_results_batch:
268
  if collected >= books_per_era: break
 
269
 
270
  id_ = res.get('identifier')
271
  raw_date = res.get('date') or res.get('year')
 
280
  if any(r['filename'].endswith(f"{id_}.txt") for r in records):
281
  continue
282
 
283
+ # The slow operation (download) is now outside the generator iteration
284
  rec = download_book(id_, res.get('title', 'Unknown'), year, era_label, min_char_limit=min_chars)
285
  if rec:
286
  rec['topic'] = "Classic" if using_rescue else topic
 
288
  collected += 1
289
  print(f" ✅ Saved ({collected}/{books_per_era}): {rec['title']} ({year})")
290
 
 
 
 
291
  if results_found == 0:
292
  print(f" ⚠️ No results found for this query")
293
 
 
297
 
298
  print(f"Completed {era_label}: {collected}/{books_per_era} books collected")
299
 
300
+ # ... (rest of the fallback logic remains the same) ...
301
  if era_label == "1_Late_Medieval" and collected < books_per_era * 0.3:
302
  print(f"\n⚠️ EMERGENCY FALLBACK MODE for {era_label}")
303
  fallback_attempts = 0
 
314
  print(f" > 🚨 Fallback #{fallback_attempts}: {term}")
315
 
316
  try:
317
+ search_generator = internetarchive.search_items(query, sorts=['downloads desc'], fields=['identifier', 'title', 'date', 'year'])
318
+
319
+ # Pre-fetch for fallback as well
320
+ fallback_batch = []
321
+ for i, item in enumerate(search_generator):
322
+ fallback_batch.append(item)
323
+ if i >= 100: break # Increased limit for fallback
324
+
325
  checked = 0
326
+ for res in fallback_batch:
327
+ if collected >= books_per_era:
328
  break
329
  checked += 1
330
 
 
345
  except Exception as e:
346
  print(f" ❌ Fallback error: {e}")
347
  time.sleep(1)
348
+ # ... (end of fallback logic) ...
349
+
350
 
351
  if not records: return None, pd.DataFrame(), pd.DataFrame()
352
 
 
480
  )
481
  MODEL.to(DEVICE)
482
 
 
483
  optimizer = AdamW(MODEL.parameters(), lr=learning_rate)
484
  total_steps = len(train_loader) * epochs
485
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)