Spaces:
Running
Running
File size: 3,998 Bytes
7b4f5dd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | """
Hugging Face repository connector.
Clones a public Hugging Face space/model/dataset to a temporary local directory
and returns the path for downstream parsing.
"""
from __future__ import annotations
import logging
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# Regex for validating Hugging Face URLs
HF_URL_RE = re.compile(
r"^https?://huggingface\.co/(?P<type>spaces/)?(?P<owner>[A-Za-z0-9_.\-]+)/(?P<repo>[A-Za-z0-9_.\-]+?)(?:\.git)?(?:/.*)?$"
)
def _validate_hf_url(url: str) -> re.Match:
"""Raise ValueError if the URL is not a valid public Hugging Face URL."""
match = HF_URL_RE.match(url.strip())
if not match:
raise ValueError(
f"Invalid Hugging Face URL: {url!r}. "
"Expected format: https://huggingface.co/[spaces/]<owner>/<repo>"
)
return match
def _git_clone(clone_url: str, dest: str) -> None:
    """Shallow-clone *clone_url* into *dest*, raising RuntimeError on any failure."""
    # Only the import is guarded, so an ImportError can never mask (or be
    # triggered by) the clone itself.
    try:
        import git  # type: ignore
    except ImportError:
        git = None

    if git is not None:
        try:
            git.Repo.clone_from(
                clone_url,
                dest,
                depth=1,  # shallow clone — we only need the code, not history
                no_single_branch=True,
            )
        except git.GitCommandError as exc:
            raise RuntimeError(f"git clone failed: {exc}") from exc
        return

    # gitpython is unavailable: fall back to the git command-line client.
    import subprocess  # noqa: S404

    try:
        result = subprocess.run(  # noqa: S603 S607
            ["git", "clone", "--depth", "1", clone_url, dest],
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"git clone timed out after {exc.timeout} seconds"
        ) from exc
    except OSError as exc:
        # Typically: the git executable is not installed / not on PATH.
        raise RuntimeError(f"git clone could not be started: {exc}") from exc
    if result.returncode != 0:
        raise RuntimeError(
            f"git clone failed (exit {result.returncode}): {result.stderr.strip()}"
        )


def clone_repo(url: str, target_dir: Optional[str] = None) -> str:
    """
    Clone a Hugging Face repository into *target_dir* (or a temp dir).

    The repository is cloned into a sub-directory of *target_dir* named after
    the repo. When *target_dir* is omitted a fresh temporary directory is
    created; if the clone then fails, that temporary directory is removed
    again before the error propagates.

    Returns the path to the cloned repository root.

    Raises:
        ValueError: If the URL is invalid.
        RuntimeError: If git clone fails (non-zero exit, gitpython error,
            timeout, or git could not be started).
    """
    match = _validate_hf_url(url)
    repo_type = match.group("type") or ""
    owner = match.group("owner")
    repo = match.group("repo")

    # Build a clean clone URL (drops ".git" suffixes and deeper paths).
    clone_url = f"https://huggingface.co/{repo_type}{owner}/{repo}"

    owns_target_dir = target_dir is None
    if target_dir is None:
        target_dir = tempfile.mkdtemp(prefix="codesentry_hf_")
    dest = os.path.join(target_dir, repo)
    logger.info("Cloning %s → %s", clone_url, dest)

    try:
        _git_clone(clone_url, dest)
    except BaseException:
        # Do not leak the temp dir we created; a caller-supplied directory is
        # left untouched because the caller owns it.
        if owns_target_dir:
            shutil.rmtree(target_dir, ignore_errors=True)
        raise
    return dest
def cleanup_repo(path: str) -> None:
    """
    Remove a cloned repository directory from disk.

    Cleanup is best-effort: this function never raises. A failure is logged
    as a warning, and success is only logged when the tree was actually
    removed.
    """
    try:
        shutil.rmtree(path)
    except OSError as exc:
        if not os.path.lexists(path):
            # Nothing (left) on disk, so there is nothing to report.
            logger.debug("HF repo dir already absent: %s", path)
            return
        logger.warning("Failed to clean up %s: %s", path, exc)
        # A plain rmtree stops at the first error; run a tolerant second pass
        # so everything that *can* be deleted still is.
        shutil.rmtree(path, ignore_errors=True)
    except Exception as exc:  # best-effort: cleanup must never raise
        logger.warning("Failed to clean up %s: %s", path, exc)
    else:
        logger.debug("Cleaned up HF repo dir: %s", path)
def get_repo_info(url: str) -> dict:
    """
    Describe the repository a Hugging Face URL points at, without cloning it.

    Returns a dict with the keys ``owner``, ``repo`` and ``clone_url``.

    Raises:
        ValueError: If the URL is rejected by ``_validate_hf_url``.
    """
    parsed = _validate_hf_url(url)
    owner, repo = parsed.group("owner", "repo")
    # The optional type group is None when the URL carries no type prefix.
    prefix = parsed.group("type") or ""

    info = {"owner": owner, "repo": repo}
    info["clone_url"] = f"https://huggingface.co/{prefix}{owner}/{repo}"
    return info
class HuggingFaceConnector:
    """
    Context-manager wrapper around clone/cleanup.

    Entering creates a temporary directory and clones the repository into it;
    leaving removes that temporary directory again.

    Usage::

        with HuggingFaceConnector("https://huggingface.co/spaces/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)

        async with HuggingFaceConnector("https://huggingface.co/spaces/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)
    """

    def __init__(self, url: str) -> None:
        self.url = url
        # Path of the cloned repository while the context is active.
        self._repo_dir: Optional[str] = None
        # Temporary parent directory owned (and removed) by this connector.
        self._tmp_dir: Optional[str] = None

    def __enter__(self) -> str:
        """Clone the repository into a fresh temp dir and return its path."""
        self._tmp_dir = tempfile.mkdtemp(prefix="codesentry_hf_")
        try:
            self._repo_dir = clone_repo(self.url, target_dir=self._tmp_dir)
        except BaseException:
            # __exit__ is not called when __enter__ raises, so the temp dir
            # has to be removed here or it would be leaked.
            self.__exit__(None, None, None)
            raise
        return self._repo_dir

    def __exit__(self, *_: object) -> None:
        """Remove the temp dir (if any) and reset the connector's state."""
        if self._tmp_dir:
            cleanup_repo(self._tmp_dir)
        self._tmp_dir = None
        self._repo_dir = None

    # Async support
    # NOTE: both wrappers delegate to the synchronous methods, so the clone
    # and the cleanup run on the calling (event-loop) thread.
    async def __aenter__(self) -> str:
        return self.__enter__()

    async def __aexit__(self, *args: object) -> None:
        self.__exit__(*args)
|