Spaces:

factorstudios
/

lighter

Sleeping

App Files Community

factorstudios committed on 2 days ago

Commit

bc7d51f

verified ·

1 Parent(s): 2133b69

Update server.py

Browse files

Files changed (1) hide show

server.py +75 -13

server.py CHANGED Viewed

@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
 import os
 import json
 import re
 import asyncio
 from pathlib import Path
 from datetime import datetime
 from dotenv import load_dotenv
@@ -271,15 +274,28 @@ async def scan_and_process_highlights():
     print("="*80)
     try:
-        # List all transcription files
-        print(f"Scanning {HF_DATASET_REPO}/{TRANSCRIPTION_FOLDER}/ for transcription files...")
         files = list_repo_files(
             repo_id=HF_DATASET_REPO,
             repo_type="dataset",
             token=HF_TOKEN
         )
         transcript_files = [
             f for f in files
             if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
@@ -289,15 +305,45 @@ async def scan_and_process_highlights():
         if not transcript_files:
             print("No transcription files found to process")
             return
-        # Process each transcription
         for transcript_file in transcript_files:
             try:
                 # Download transcript
                 local_path = hf_hub_download(
                     repo_id=HF_DATASET_REPO,
-                    filename=transcript_file,
                     repo_type="dataset",
                     token=HF_TOKEN,
                     cache_dir="/tmp/highlight_transcripts"
@@ -307,13 +353,12 @@ async def scan_and_process_highlights():
                 with open(local_path, 'r', encoding='utf-8') as f:
                     transcript_content = f.read()
-                # Extract just the filename
-                just_filename = os.path.basename(transcript_file)
                 # Process for highlights
                 await process_transcription_for_highlights(
                     HF_DATASET_REPO,
-                    just_filename,
                     transcript_content
                 )
@@ -321,7 +366,7 @@ async def scan_and_process_highlights():
                 await asyncio.sleep(2)
             except Exception as e:
-                print(f"Error downloading {transcript_file}: {e}")
                 processing_state["error_count"] += 1
                 continue
@@ -339,8 +384,19 @@ async def scan_and_process_highlights():
 @app.on_event("startup")
 async def startup_event():
-    """Start highlight extraction on server startup."""
-    asyncio.create_task(scan_and_process_highlights())
 @app.get("/")
 async def health():
@@ -365,7 +421,13 @@ async def trigger_extraction():
             "message": "Highlight extraction is already in progress"
         })
-    asyncio.create_task(scan_and_process_highlights())
     return JSONResponse({
         "status": "started",
         "message": "Highlight extraction scan started"
@@ -384,6 +446,6 @@ async def get_status():
     })
 if __name__ == "__main__":
-    print("Starting Movie Highlight Extraction Service on port 7861...")
     print("Will automatically scan and process transcriptions on startup")
     uvicorn.run(app, host="0.0.0.0", port=7860)

 #!/usr/bin/env python3
 import os
+import sys
 import json
 import re
 import asyncio
+import threading
+import time
 from pathlib import Path
 from datetime import datetime
 from dotenv import load_dotenv
     print("="*80)
     try:
+        # List all files in repository
+        print(f"Connecting to {HF_DATASET_REPO}...")
         files = list_repo_files(
             repo_id=HF_DATASET_REPO,
             repo_type="dataset",
             token=HF_TOKEN
         )
+        # Get existing hook folders
+        print("Scanning for existing hook folders...")
+        existing_hooks = set()
+        for f in files:
+            if f.startswith(f"{HIGHLIGHTS_FOLDER}/"):
+                # Extract movie folder name
+                parts = f.split("/")
+                if len(parts) >= 2:
+                    movie_name = parts[1]
+                    existing_hooks.add(movie_name)
+        print(f"✓ Found {len(existing_hooks)} movie folders in hooks/: {existing_hooks}")
+        # Get all transcription files
         transcript_files = [
             f for f in files
             if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
         if not transcript_files:
             print("No transcription files found to process")
+            processing_state["is_running"] = False
             return
+        # Filter transcriptions to only those not yet processed
+        unprocessed_transcripts = []
         for transcript_file in transcript_files:
             try:
+                just_filename = os.path.basename(transcript_file)
+                movie_name = just_filename.replace(".transcript.txt", "").replace(".txt", "")
+                # Skip if hooks already exist for this movie
+                if movie_name in existing_hooks:
+                    print(f"  ⊘ {movie_name} (already has hooks)")
+                    continue
+                unprocessed_transcripts.append({
+                    "path": transcript_file,
+                    "filename": just_filename,
+                    "movie_name": movie_name
+                })
+            except Exception as e:
+                print(f"Error parsing transcript {transcript_file}: {e}")
+                continue
+        print(f"\n✓ Found {len(unprocessed_transcripts)} unprocessed movies")
+        if not unprocessed_transcripts:
+            print("✓ All transcriptions already have hooks!")
+            processing_state["is_running"] = False
+            return
+        # Process each unprocessed transcription
+        for transcript_info in unprocessed_transcripts:
+            try:
+                print(f"\nDownloading: {transcript_info['path']}")
                 # Download transcript
                 local_path = hf_hub_download(
                     repo_id=HF_DATASET_REPO,
+                    filename=transcript_info["path"],
                     repo_type="dataset",
                     token=HF_TOKEN,
                     cache_dir="/tmp/highlight_transcripts"
                 with open(local_path, 'r', encoding='utf-8') as f:
                     transcript_content = f.read()
+                print(f"✓ Downloaded: {transcript_info['filename']}")
                 # Process for highlights
                 await process_transcription_for_highlights(
                     HF_DATASET_REPO,
+                    transcript_info["filename"],
                     transcript_content
                 )
                 await asyncio.sleep(2)
             except Exception as e:
+                print(f"Error processing {transcript_info['path']}: {e}")
                 processing_state["error_count"] += 1
                 continue
 @app.on_event("startup")
 async def startup_event():
+    """Schedule highlight extraction on server startup with background thread."""
+    print("\n" + "="*80)
+    print("STARTUP EVENT TRIGGERED - Highlight Extraction Service")
+    print("="*80)
+    # Schedule scan in a background thread (more reliable for deployment)
+    def run_scan():
+        print("Starting highlight extraction scan...")
+        asyncio.run(scan_and_process_highlights())
+    scan_thread = threading.Thread(target=run_scan, daemon=True)
+    scan_thread.start()
+    print("✓ Background scan thread scheduled")
 @app.get("/")
 async def health():
             "message": "Highlight extraction is already in progress"
         })
+    # Use threading for consistent behavior
+    def run_scan():
+        asyncio.run(scan_and_process_highlights())
+    scan_thread = threading.Thread(target=run_scan, daemon=True)
+    scan_thread.start()
     return JSONResponse({
         "status": "started",
         "message": "Highlight extraction scan started"
     })
 if __name__ == "__main__":
+    print("Starting Movie Highlight Extraction Service on port 7860...")
     print("Will automatically scan and process transcriptions on startup")
     uvicorn.run(app, host="0.0.0.0", port=7860)