File size: 3,998 Bytes
7b4f5dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Hugging Face repository connector.
Clones a public Hugging Face space/model/dataset to a temporary local directory
and returns the path for downstream parsing.
"""
from __future__ import annotations

import logging
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Regex for validating Hugging Face URLs.
#
# Named groups:
#   type  - optional repo-kind prefix, including its trailing slash
#           ("spaces/" or "datasets/"); absent for model repos.
#   owner - user or organisation segment.
#   repo  - repository name; matched lazily so an optional ".git" suffix and
#           any trailing sub-path (e.g. "/tree/main") are excluded from it.
#
# "datasets/" must be listed explicitly: without it a dataset URL such as
# https://huggingface.co/datasets/foo/bar would be parsed as
# owner="datasets", repo="foo" and yield the wrong clone URL.
HF_URL_RE = re.compile(
    r"^https?://huggingface\.co/"
    r"(?P<type>spaces/|datasets/)?"
    r"(?P<owner>[A-Za-z0-9_.\-]+)/"
    r"(?P<repo>[A-Za-z0-9_.\-]+?)"
    r"(?:\.git)?(?:/.*)?$"
)


def _validate_hf_url(url: str) -> re.Match:
    """Validate *url* against ``HF_URL_RE`` and return the resulting match.

    Leading/trailing whitespace is stripped before matching.

    Args:
        url: Candidate Hugging Face repository URL.

    Returns:
        The match object, exposing the ``type``, ``owner`` and ``repo`` groups.

    Raises:
        ValueError: If the URL does not match the expected format, or if the
            owner/repo segment is ``.`` or ``..``.
    """
    match = HF_URL_RE.match(url.strip())

    # "." and ".." satisfy the character class in the pattern, but they are
    # path-navigation segments rather than names. ``clone_repo`` joins the
    # repo name onto a local directory, so letting them through would make
    # the clone destination point at (or above) the target directory itself.
    if match is not None and {match.group("owner"), match.group("repo")} & {".", ".."}:
        match = None

    if match is None:
        raise ValueError(
            f"Invalid Hugging Face URL: {url!r}. "
            "Expected format: https://huggingface.co/[spaces/]<owner>/<repo>"
        )
    return match


def _run_git_clone(clone_url: str, dest: str) -> None:
    """Shallow-clone *clone_url* into *dest*, raising RuntimeError on any failure."""
    # Keep the import probe separate from the clone call so that only a
    # missing GitPython triggers the subprocess fallback.
    try:
        import git  # type: ignore
    except ImportError:
        git = None

    if git is not None:
        try:
            git.Repo.clone_from(
                clone_url,
                dest,
                depth=1,  # shallow clone — we only need the code, not history
                no_single_branch=True,
            )
        except git.exc.GitCommandError as exc:
            # Normalise to the exception type documented by clone_repo.
            raise RuntimeError(f"git clone failed: {exc}") from exc
        return

    import subprocess  # noqa: S404

    timeout = 120
    try:
        result = subprocess.run(  # noqa: S603 S607
            ["git", "clone", "--depth", "1", clone_url, dest],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(f"git clone timed out after {timeout}s") from exc
    except OSError as exc:
        # e.g. the git executable is not installed / not on PATH.
        raise RuntimeError(f"git clone failed: {exc}") from exc

    if result.returncode != 0:
        raise RuntimeError(
            f"git clone failed (exit {result.returncode}): {result.stderr.strip()}"
        )


def clone_repo(url: str, target_dir: Optional[str] = None) -> str:
    """
    Clone a Hugging Face repository into *target_dir* (or a temp dir).

    The clone URL is rebuilt from the validated owner/repo parts (always
    ``https``, no sub-path), and the repository is placed in a sub-directory
    of *target_dir* named after the repo. GitPython is used when it can be
    imported; otherwise the ``git`` executable is invoked via ``subprocess``.

    Args:
        url: Hugging Face repository URL.
        target_dir: Parent directory for the clone. When ``None`` a fresh
            temporary directory (prefix ``codesentry_hf_``) is created; the
            caller owns it after a successful return.

    Returns:
        The path to the cloned repository root.

    Raises:
        ValueError: If the URL is invalid.
        RuntimeError: If git clone fails (non-zero exit, GitPython command
            error, timeout, or git executable unavailable).
    """
    match = _validate_hf_url(url)
    repo_type = match.group("type") or ""
    owner = match.group("owner")
    repo = match.group("repo")

    # Build a clean clone URL
    clone_url = f"https://huggingface.co/{repo_type}{owner}/{repo}"

    # Remember whether the parent directory is ours, so it can be removed if
    # the clone fails; a caller-supplied directory is never deleted here.
    owns_target_dir = target_dir is None
    if owns_target_dir:
        target_dir = tempfile.mkdtemp(prefix="codesentry_hf_")

    dest = os.path.join(target_dir, repo)
    logger.info("Cloning %s → %s", clone_url, dest)

    cloned = False
    try:
        _run_git_clone(clone_url, dest)
        cloned = True
    finally:
        if owns_target_dir and not cloned:
            # The caller never learns the temp path on failure, so nobody
            # else could clean it up.
            shutil.rmtree(target_dir, ignore_errors=True)

    return dest


def cleanup_repo(path: str) -> None:
    """Remove a cloned repository directory from disk (best effort).

    Never raises: removal errors are suppressed and reported through the
    module logger instead.

    Args:
        path: Directory to delete recursively.
    """
    try:
        shutil.rmtree(path, ignore_errors=True)
        # ignore_errors=True swallows failures silently, so check the outcome
        # explicitly rather than reporting success unconditionally.
        if os.path.lexists(path):
            logger.warning(
                "Failed to clean up %s: %s", path, "directory still exists after removal"
            )
        else:
            logger.debug("Cleaned up HF repo dir: %s", path)
    except Exception as exc:
        # Deliberately broad: cleanup must not mask the caller's own error.
        logger.warning("Failed to clean up %s: %s", path, exc)

def get_repo_info(url: str) -> dict:
    """Parse a Hugging Face URL into its components without cloning.

    Returns a dict with the keys ``owner``, ``repo`` and ``clone_url``; the
    clone URL is rebuilt from the validated parts, so any ``.git`` suffix or
    trailing sub-path in *url* is dropped. An invalid URL raises
    ``ValueError`` (propagated from ``_validate_hf_url``).
    """
    parsed = _validate_hf_url(url)
    owner, repo = parsed.group("owner", "repo")
    prefix = parsed.group("type") or ""  # "" when the URL has no type prefix

    info = {"owner": owner, "repo": repo}
    info["clone_url"] = f"https://huggingface.co/{prefix}{owner}/{repo}"
    return info


class HuggingFaceConnector:
    """
    Context-manager wrapper around clone/cleanup.

    On entry the repository at ``url`` is cloned into a fresh temporary
    directory and the path of the clone is returned; on exit the temporary
    directory is removed. Both the synchronous and the asynchronous
    context-manager protocols are supported.

    Usage::

        with HuggingFaceConnector("https://huggingface.co/spaces/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)

        async with HuggingFaceConnector("https://huggingface.co/spaces/foo/bar") as repo_dir:
            files = parse_directory(repo_dir)
    """

    def __init__(self, url: str) -> None:
        self.url = url
        # Path of the cloned repository; set while the context is active.
        self._repo_dir: Optional[str] = None
        # Temporary parent directory owned by this connector.
        self._tmp_dir: Optional[str] = None

    def __enter__(self) -> str:
        tmp_dir = tempfile.mkdtemp(prefix="codesentry_hf_")
        self._tmp_dir = tmp_dir
        try:
            self._repo_dir = clone_repo(self.url, target_dir=tmp_dir)
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so the temp
            # directory has to be removed here or it would be leaked.
            self._tmp_dir = None
            cleanup_repo(tmp_dir)
            raise
        return self._repo_dir

    def __exit__(self, *_: object) -> None:
        # Clear state first so a repeated exit (or reuse of the instance)
        # never tries to remove the same directory twice.
        tmp_dir = self._tmp_dir
        self._tmp_dir = None
        self._repo_dir = None
        if tmp_dir:
            cleanup_repo(tmp_dir)

    # Async support
    # NOTE(review): these delegate to the synchronous methods, so the clone
    # and the cleanup run on the calling thread and block the event loop
    # until they finish. Consider offloading to an executor if that matters.
    async def __aenter__(self) -> str:
        return self.__enter__()

    async def __aexit__(self, *args: object) -> None:
        self.__exit__(*args)