Spaces:
Running
Running
Asish Karthikeya Gogineni committed on
Commit ·
48661b1
1
Parent(s): e5ab135
fix: Robust GitHub URL sanitization
Browse files
- Automatically strips trailing backslashes (\) and slashes (/) from inputs
- Handles common copy-paste errors (e.g. https://.../repo\)
- Removes .git suffix if present
code_chatbot/universal_ingestor.py
CHANGED
|
@@ -56,15 +56,21 @@ class UniversalIngestor(DataManager):
|
|
| 56 |
|
| 57 |
def _detect_handler(self) -> DataManager:
|
| 58 |
"""Detects the type of input and returns the appropriate handler."""
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
# Check if it's a URL
|
| 62 |
if self._is_url(source):
|
| 63 |
-
if "github.com" in source or source.count("/") == 1 and "/" in source:
|
| 64 |
# GitHub URL or repo ID (owner/repo)
|
| 65 |
if "github.com" in source:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# Extract repo_id from URL
|
| 67 |
parts = urlparse(source).path.strip("/").split("/")
|
|
|
|
| 68 |
if len(parts) >= 2:
|
| 69 |
repo_id = f"{parts[0]}/{parts[1]}"
|
| 70 |
else:
|
|
|
|
| 56 |
|
| 57 |
def _detect_handler(self) -> DataManager:
|
| 58 |
"""Detects the type of input and returns the appropriate handler."""
|
| 59 |
+
# Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
|
| 60 |
+
source = self.source.strip().strip('\\').strip('/')
|
| 61 |
|
| 62 |
# Check if it's a URL
|
| 63 |
if self._is_url(source):
|
| 64 |
+
if "github.com" in source or (source.count("/") == 1 and "/" in source):
|
| 65 |
# GitHub URL or repo ID (owner/repo)
|
| 66 |
if "github.com" in source:
|
| 67 |
+
# Remove .git suffix if present
|
| 68 |
+
if source.endswith(".git"):
|
| 69 |
+
source = source[:-4]
|
| 70 |
+
|
| 71 |
# Extract repo_id from URL
|
| 72 |
parts = urlparse(source).path.strip("/").split("/")
|
| 73 |
+
# Handle cases like https://github.com/owner/repo/tree/main/...
|
| 74 |
if len(parts) >= 2:
|
| 75 |
repo_id = f"{parts[0]}/{parts[1]}"
|
| 76 |
else:
|