factorstudios committed on
Commit
bc7d51f
·
verified ·
1 Parent(s): 2133b69

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +75 -13
server.py CHANGED
@@ -1,8 +1,11 @@
1
  #!/usr/bin/env python3
2
  import os
 
3
  import json
4
  import re
5
  import asyncio
 
 
6
  from pathlib import Path
7
  from datetime import datetime
8
  from dotenv import load_dotenv
@@ -271,15 +274,28 @@ async def scan_and_process_highlights():
271
  print("="*80)
272
 
273
  try:
274
- # List all transcription files
275
- print(f"Scanning {HF_DATASET_REPO}/{TRANSCRIPTION_FOLDER}/ for transcription files...")
276
-
277
  files = list_repo_files(
278
  repo_id=HF_DATASET_REPO,
279
  repo_type="dataset",
280
  token=HF_TOKEN
281
  )
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  transcript_files = [
284
  f for f in files
285
  if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
@@ -289,15 +305,45 @@ async def scan_and_process_highlights():
289
 
290
  if not transcript_files:
291
  print("No transcription files found to process")
 
292
  return
293
 
294
- # Process each transcription
 
295
  for transcript_file in transcript_files:
296
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  # Download transcript
298
  local_path = hf_hub_download(
299
  repo_id=HF_DATASET_REPO,
300
- filename=transcript_file,
301
  repo_type="dataset",
302
  token=HF_TOKEN,
303
  cache_dir="/tmp/highlight_transcripts"
@@ -307,13 +353,12 @@ async def scan_and_process_highlights():
307
  with open(local_path, 'r', encoding='utf-8') as f:
308
  transcript_content = f.read()
309
 
310
- # Extract just the filename
311
- just_filename = os.path.basename(transcript_file)
312
 
313
  # Process for highlights
314
  await process_transcription_for_highlights(
315
  HF_DATASET_REPO,
316
- just_filename,
317
  transcript_content
318
  )
319
 
@@ -321,7 +366,7 @@ async def scan_and_process_highlights():
321
  await asyncio.sleep(2)
322
 
323
  except Exception as e:
324
- print(f"Error downloading {transcript_file}: {e}")
325
  processing_state["error_count"] += 1
326
  continue
327
 
@@ -339,8 +384,19 @@ async def scan_and_process_highlights():
339
 
340
  @app.on_event("startup")
341
  async def startup_event():
342
- """Start highlight extraction on server startup."""
343
- asyncio.create_task(scan_and_process_highlights())
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  @app.get("/")
346
  async def health():
@@ -365,7 +421,13 @@ async def trigger_extraction():
365
  "message": "Highlight extraction is already in progress"
366
  })
367
 
368
- asyncio.create_task(scan_and_process_highlights())
 
 
 
 
 
 
369
  return JSONResponse({
370
  "status": "started",
371
  "message": "Highlight extraction scan started"
@@ -384,6 +446,6 @@ async def get_status():
384
  })
385
 
386
  if __name__ == "__main__":
387
- print("Starting Movie Highlight Extraction Service on port 7861...")
388
  print("Will automatically scan and process transcriptions on startup")
389
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  #!/usr/bin/env python3
2
  import os
3
+ import sys
4
  import json
5
  import re
6
  import asyncio
7
+ import threading
8
+ import time
9
  from pathlib import Path
10
  from datetime import datetime
11
  from dotenv import load_dotenv
 
274
  print("="*80)
275
 
276
  try:
277
+ # List all files in repository
278
+ print(f"Connecting to {HF_DATASET_REPO}...")
 
279
  files = list_repo_files(
280
  repo_id=HF_DATASET_REPO,
281
  repo_type="dataset",
282
  token=HF_TOKEN
283
  )
284
 
285
+ # Get existing hook folders
286
+ print("Scanning for existing hook folders...")
287
+ existing_hooks = set()
288
+ for f in files:
289
+ if f.startswith(f"{HIGHLIGHTS_FOLDER}/"):
290
+ # Extract movie folder name
291
+ parts = f.split("/")
292
+ if len(parts) >= 2:
293
+ movie_name = parts[1]
294
+ existing_hooks.add(movie_name)
295
+
296
+ print(f"βœ“ Found {len(existing_hooks)} movie folders in hooks/: {existing_hooks}")
297
+
298
+ # Get all transcription files
299
  transcript_files = [
300
  f for f in files
301
  if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
 
305
 
306
  if not transcript_files:
307
  print("No transcription files found to process")
308
+ processing_state["is_running"] = False
309
  return
310
 
311
+ # Filter transcriptions to only those not yet processed
312
+ unprocessed_transcripts = []
313
  for transcript_file in transcript_files:
314
  try:
315
+ just_filename = os.path.basename(transcript_file)
316
+ movie_name = just_filename.replace(".transcript.txt", "").replace(".txt", "")
317
+
318
+ # Skip if hooks already exist for this movie
319
+ if movie_name in existing_hooks:
320
+ print(f" ⊘ {movie_name} (already has hooks)")
321
+ continue
322
+
323
+ unprocessed_transcripts.append({
324
+ "path": transcript_file,
325
+ "filename": just_filename,
326
+ "movie_name": movie_name
327
+ })
328
+ except Exception as e:
329
+ print(f"Error parsing transcript {transcript_file}: {e}")
330
+ continue
331
+
332
+ print(f"\nβœ“ Found {len(unprocessed_transcripts)} unprocessed movies")
333
+
334
+ if not unprocessed_transcripts:
335
+ print("βœ“ All transcriptions already have hooks!")
336
+ processing_state["is_running"] = False
337
+ return
338
+
339
+ # Process each unprocessed transcription
340
+ for transcript_info in unprocessed_transcripts:
341
+ try:
342
+ print(f"\nDownloading: {transcript_info['path']}")
343
  # Download transcript
344
  local_path = hf_hub_download(
345
  repo_id=HF_DATASET_REPO,
346
+ filename=transcript_info["path"],
347
  repo_type="dataset",
348
  token=HF_TOKEN,
349
  cache_dir="/tmp/highlight_transcripts"
 
353
  with open(local_path, 'r', encoding='utf-8') as f:
354
  transcript_content = f.read()
355
 
356
+ print(f"βœ“ Downloaded: {transcript_info['filename']}")
 
357
 
358
  # Process for highlights
359
  await process_transcription_for_highlights(
360
  HF_DATASET_REPO,
361
+ transcript_info["filename"],
362
  transcript_content
363
  )
364
 
 
366
  await asyncio.sleep(2)
367
 
368
  except Exception as e:
369
+ print(f"Error processing {transcript_info['path']}: {e}")
370
  processing_state["error_count"] += 1
371
  continue
372
 
 
384
 
385
  @app.on_event("startup")
386
  async def startup_event():
387
+ """Schedule highlight extraction on server startup with background thread."""
388
+ print("\n" + "="*80)
389
+ print("STARTUP EVENT TRIGGERED - Highlight Extraction Service")
390
+ print("="*80)
391
+
392
+ # Schedule scan in a background thread (more reliable for deployment)
393
+ def run_scan():
394
+ print("Starting highlight extraction scan...")
395
+ asyncio.run(scan_and_process_highlights())
396
+
397
+ scan_thread = threading.Thread(target=run_scan, daemon=True)
398
+ scan_thread.start()
399
+ print("βœ“ Background scan thread scheduled")
400
 
401
  @app.get("/")
402
  async def health():
 
421
  "message": "Highlight extraction is already in progress"
422
  })
423
 
424
+ # Use threading for consistent behavior
425
+ def run_scan():
426
+ asyncio.run(scan_and_process_highlights())
427
+
428
+ scan_thread = threading.Thread(target=run_scan, daemon=True)
429
+ scan_thread.start()
430
+
431
  return JSONResponse({
432
  "status": "started",
433
  "message": "Highlight extraction scan started"
 
446
  })
447
 
448
  if __name__ == "__main__":
449
+ print("Starting Movie Highlight Extraction Service on port 7860...")
450
  print("Will automatically scan and process transcriptions on startup")
451
  uvicorn.run(app, host="0.0.0.0", port=7860)