Spaces:

ai4data
/

data-use-extractor

Running

App Files Files Community

data-use-extractor / app.py

rafmacalaba

Upload folder using huggingface_hub

1baa73a verified 18 days ago

Raw

History Blame Contribute Delete

3.25 kB

	import os
	import sys
	import subprocess
	from pathlib import Path

	# 1. Retrieve the GitHub token
	# Configured default model: ai4data/datause-extraction-v1 (with sample_pdfs fallback)
	#
	# GITHUB_TOKEN takes precedence over PAT. Each candidate is stripped because a
	# secret pasted with stray whitespace or a trailing newline would otherwise be
	# interpolated verbatim into the clone URL; a blank or whitespace-only value is
	# treated as unset so that it cannot mask a valid value in the other variable.
	token = (
	    (os.environ.get("GITHUB_TOKEN") or "").strip()
	    or (os.environ.get("PAT") or "").strip()
	)
	if not token:
	    raise ValueError("Please set the GITHUB_TOKEN or PAT secret in Space settings.")

	# 2. Clone/Pull private repository
	# The token is embedded in the HTTPS URL, so git's stderr and the
	# CalledProcessError itself (whose message includes the full command line) may
	# contain the secret. Failures are re-raised with the token masked and with the
	# original exception context suppressed.
	repo_url = f"https://oauth2:{token}@github.com/rafmacalaba/monitoring_of_datause.git"
	repo_dir = Path("monitoring_of_datause")


	def _run_git(args, action):
	    """Run ``git`` with *args*, redacting the token from any failure message.

	    Args:
	        args: Arguments passed to ``git`` (without the leading ``"git"``).
	        action: Verb used in the error message, e.g. ``"clone"`` or ``"pull"``.

	    Raises:
	        RuntimeError: If git exits with a non-zero status. The message carries
	            git's stderr with every occurrence of the token replaced.
	    """
	    try:
	        subprocess.run(["git", *args], check=True, capture_output=True, text=True)
	    except subprocess.CalledProcessError as e:
	        # stderr is a str here because of capture_output/text; the `or ""`
	        # only guards against it ever being None.
	        stderr_clean = (e.stderr or "").replace(token, "********")
	        # `from None` keeps the original exception (and its command line,
	        # which contains the token) out of the printed traceback.
	        raise RuntimeError(f"Failed to {action} repository: {stderr_clean}") from None


	if not repo_dir.exists():
	    print("Cloning private repository...")
	    _run_git(["clone", repo_url, str(repo_dir)], "clone")
	else:
	    print("Repository exists. Pulling updates...")
	    _run_git(["-C", str(repo_dir), "pull"], "pull")

	# 3. Install the local gliner2 source tree first, in editable mode. This is done
	# explicitly because standard pip does not read uv.sources. The step is skipped
	# when the checkout has no gliner2-src directory.
	gliner2_path = repo_dir / "gliner2-src"
	if gliner2_path.exists():
	    print("Installing local gliner2 library...")
	    editable_install_cmd = [sys.executable, "-m", "pip", "install", "-e", str(gliner2_path)]
	    subprocess.run(editable_install_cmd, check=True)

	# 4. Install other dependencies and project package from the repository's
	# requirements.txt, if it has one. pip runs with the repository as its working
	# directory, so the file is passed by its bare name rather than by path.
	requirements_file = repo_dir / "requirements.txt"
	if requirements_file.exists():
	    print("Installing dependencies...")
	    pip_requirements_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
	    subprocess.run(pip_requirements_cmd, check=True, cwd=str(repo_dir))

	# 4.5. Pre-download models to cache during startup
	# Best-effort: any failure is reported as a warning and startup continues.
	print("Pre-downloading models to local cache...")
	try:
	    # Imported here rather than at the top of the file, i.e. only after the
	    # pip install steps above have run.
	    from huggingface_hub import snapshot_download

	    for model_repo_id in (
	        "fastino/gliner2-large-v1",
	        "ai4data/datause-extraction-v1",
	        "ai4data-use/bert-base-uncased-data-use",
	    ):
	        snapshot_download(repo_id=model_repo_id)
	    print("Models pre-downloaded successfully.")
	except Exception as e:
	    print(f"Warning: Failed to pre-download models: {e}")

	# 5. Add to Python path and launch the app
	# Every directory is pushed to the front of sys.path, so the one inserted last
	# (datause_extract) is searched first, then src, then the repository root.
	for import_root in (repo_dir, repo_dir / "src", repo_dir / "datause_extract"):
	    sys.path.insert(0, str(import_root))

	# These imports must come after the path setup and the pip installs above.
	# NOTE(review): `app` is expected to resolve to a module inside the cloned
	# repository — confirm it cannot resolve to this launcher script instead.
	import gradio as gr
	from app import demo, CUSTOM_CSS

	if __name__ == "__main__":
	    # Pre-load models into RAM to avoid first-use delay.
	    # Best-effort: a failure only prints a warning and the UI still launches.
	    print("Pre-loading models into RAM...")
	    try:
	        from ai4data import DatasetExtractor

	        extractor = DatasetExtractor()
	        # Touch both attributes; presumably they load lazily on first
	        # access — TODO confirm against DatasetExtractor.
	        for lazy_attr in ("model", "classifier"):
	            getattr(extractor, lazy_attr)
	        print("Models successfully pre-loaded into RAM.")
	    except Exception as e:
	        print(f"Warning: Failed to pre-load models: {e}")

	    # Slate-coloured base theme using the Inter web font with generic fallbacks.
	    ui_fonts = [gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
	    theme = gr.themes.Base(primary_hue="slate", neutral_hue="slate", font=ui_fonts)
	    demo.launch(css=CUSTOM_CSS, theme=theme)