Spaces:

Neshaki
/

ARQ-RAG-Turboquant

Running

File size: 2,591 Bytes

ba86059

import os
from dotenv import load_dotenv
from supabase import create_client, Client
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Load environment variables
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
BUCKET_NAME = "papers"
DOWNLOAD_DIR = "data/uploads"

if not SUPABASE_URL or not SUPABASE_KEY:
    print("❌ Error: SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY not found in .env")
    exit(1)

# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

def download_file(file_info):
    """Function to download a single file, used by ThreadPoolExecutor"""
    file_name = file_info['name']
    
    # Skip directories or non-pdf files if any
    if not file_name or file_name.endswith('/') or not file_name.lower().endswith('.pdf'):
        return None
        
    dest_path = os.path.join(DOWNLOAD_DIR, file_name)
    
    # Skip if already exists
    if os.path.exists(dest_path):
        return f"Skipped: {file_name}"
        
    try:
        res = supabase.storage.from_(BUCKET_NAME).download(file_name)
        with open(dest_path, "wb") as f:
            f.write(res)
        return f"Downloaded: {file_name}"
    except Exception as e:
        return f"Error {file_name}: {e}"

def download_all_papers():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    
    print(f"🔍 Listing files in bucket: {BUCKET_NAME}...")
    
    all_files = []
    limit = 100
    offset = 0
    
    while True:
        res = supabase.storage.from_(BUCKET_NAME).list(
            path="",
            options={
                "limit": limit,
                "offset": offset,
                "sortBy": {"column": "name", "order": "asc"}
            }
        )
        
        if not res:
            break
            
        all_files.extend(res)
        if len(res) < limit:
            break
            
        offset += limit
        print(f"   Found {len(all_files)} files so far...")

    total_files = len(all_files)
    print(f"✅ Total files to check: {total_files}")
    
    # Use ThreadPoolExecutor for concurrent downloads
    # Setting max_workers to 10 for a good balance between speed and stability
    print(f"🚀 Starting concurrent download (10 workers)...")
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Wrap with tqdm for progress bar
        list(tqdm(executor.map(download_file, all_files), total=total_files, desc="Syncing papers"))

if __name__ == "__main__":
    download_all_papers()
    print("\n✨ Sync completed!")