"""Bootstrap launcher for the data-use monitoring demo.

On import/execution this script:

1. reads a GitHub token from the environment,
2. clones (or updates) the private ``monitoring_of_datause`` repository,
3. installs the bundled ``gliner2`` sources and the repository requirements,
4. pre-downloads the model snapshots into the local Hugging Face cache,
5. puts the checkout on ``sys.path`` and imports the Gradio ``demo``.

When run as a script it additionally pre-loads the models into memory and
launches the Gradio app.
"""

import os
import subprocess
import sys
from pathlib import Path


def _run_git(args, failure_prefix, secret):
    """Run ``git <args>``; on failure raise RuntimeError with *secret* redacted.

    Args:
        args: Arguments passed to ``git`` (without the leading ``"git"``).
        failure_prefix: Text placed before the captured stderr in the error.
        secret: Token string that must never appear in the raised message.

    Raises:
        RuntimeError: If git exits with a non-zero status.
    """
    try:
        subprocess.run(["git", *args], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        stderr_clean = (e.stderr or "").replace(secret, "********")
        # ``from None`` drops the original exception, whose ``cmd`` attribute
        # may contain the token-bearing URL.
        raise RuntimeError(f"{failure_prefix}: {stderr_clean}") from None


# 1. Retrieve the GitHub token
# Configured default model: ai4data/datause-extraction-v1 (with sample_pdfs fallback)
token = os.environ.get("GITHUB_TOKEN") or os.environ.get("PAT")
if not token:
    raise ValueError("Please set the GITHUB_TOKEN or PAT secret in Space settings.")

# 2. Clone/Pull private repository
repo_url = f"https://oauth2:{token}@github.com/rafmacalaba/monitoring_of_datause.git"
repo_dir = Path("monitoring_of_datause")

if not repo_dir.exists():
    print("Cloning private repository...")
    _run_git(["clone", repo_url, str(repo_dir)], "Failed to clone repository", token)
else:
    print("Repository exists. Pulling updates...")
    # The stored remote URL embeds the token that was current at clone time.
    # Refresh it first so a rotated token does not break every later pull.
    _run_git(
        ["-C", str(repo_dir), "remote", "set-url", "origin", repo_url],
        "Failed to pull repository",
        token,
    )
    _run_git(["-C", str(repo_dir), "pull"], "Failed to pull repository", token)

# 3. Explicitly install local gliner2 first (since standard pip doesn't read uv.sources)
gliner2_path = repo_dir / "gliner2-src"
if gliner2_path.exists():
    print("Installing local gliner2 library...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-e", str(gliner2_path)],
        check=True,
    )

# 4. Install other dependencies and project package
requirements_file = repo_dir / "requirements.txt"
if requirements_file.exists():
    print("Installing dependencies...")
    # Run from inside the checkout so the requirements file is found by its
    # bare name and any relative entries resolve against the repository.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
        cwd=str(repo_dir),
        check=True,
    )

# 4.5. Pre-download models to cache during startup
print("Pre-downloading models to local cache...")
try:
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id="fastino/gliner2-large-v1")
    snapshot_download(repo_id="ai4data/datause-extraction-v1")
    snapshot_download(repo_id="ai4data-use/bert-base-uncased-data-use")
    print("Models pre-downloaded successfully.")
except Exception as e:
    # Best effort: a failed pre-download only produces a warning here and
    # does not stop startup.
    print(f"Warning: Failed to pre-download models: {e}")

# 5. Add to Python path and launch the app
sys.path.insert(0, str(repo_dir))
sys.path.insert(0, str(repo_dir / "src"))
sys.path.insert(0, str(repo_dir / "datause_extract"))

# These imports must come after the install steps and sys.path changes above.
import gradio as gr
from app import demo, CUSTOM_CSS

if __name__ == "__main__":
    # Pre-load models into RAM to avoid first-use delay
    print("Pre-loading models into RAM...")
    try:
        from ai4data import DatasetExtractor

        extractor = DatasetExtractor()
        # Results are discarded; the attributes are read only to trigger
        # loading, as the "pre-load" intent above describes.
        _ = extractor.model
        _ = extractor.classifier
        print("Models successfully pre-loaded into RAM.")
    except Exception as e:
        # Best effort: startup continues without the warm-up.
        print(f"Warning: Failed to pre-load models: {e}")

    theme = gr.themes.Base(
        primary_hue="slate",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    )
    demo.launch(css=CUSTOM_CSS, theme=theme)