Spaces:
Running
Running
Asish Karthikeya Gogineni committed on
Commit ·
da942b7
1
Parent(s): d79ce72
fix: Smart extraction of GitHub URLs from mixed input
Browse files
- Added regex matching to find https://github.com/... inside error messages
- Handles cases where users paste 'Error: Failed to download... <URL>'
- Retains previous sanitization (strip backslashes/.git)
code_chatbot/universal_ingestor.py
CHANGED
|
@@ -54,11 +54,21 @@ class UniversalIngestor(DataManager):
|
|
| 54 |
self.local_dir = local_dir or os.path.join(tempfile.gettempdir(), "code_chatbot")
|
| 55 |
self.delegate = self._detect_handler()
|
| 56 |
|
|
|
|
| 57 |
def _detect_handler(self) -> DataManager:
|
| 58 |
"""Detects the type of input and returns the appropriate handler."""
|
| 59 |
# Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
|
| 60 |
source = self.source.strip().strip('\\').strip('/')
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# Check if it's a URL
|
| 63 |
if self._is_url(source):
|
| 64 |
if "github.com" in source or (source.count("/") == 1 and "/" in source):
|
|
|
|
| 54 |
self.local_dir = local_dir or os.path.join(tempfile.gettempdir(), "code_chatbot")
|
| 55 |
self.delegate = self._detect_handler()
|
| 56 |
|
| 57 |
+
|
| 58 |
def _detect_handler(self) -> DataManager:
|
| 59 |
"""Detects the type of input and returns the appropriate handler."""
|
| 60 |
# Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
|
| 61 |
source = self.source.strip().strip('\\').strip('/')
|
| 62 |
|
| 63 |
+
# Smart Extraction: If input looks like garbage (has spaces, long text), try to find a URL inside it
|
| 64 |
+
if "github.com" in source and (" " in source or "\n" in source or "Error" in source):
|
| 65 |
+
import re
|
| 66 |
+
# Regex to find https://github.com/owner/repo
|
| 67 |
+
match = re.search(r'(https?://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_\-\.]+)', source)
|
| 68 |
+
if match:
|
| 69 |
+
logger.info(f"Extracted GitHub URL from text: {match.group(1)}")
|
| 70 |
+
source = match.group(1).strip().strip('\\').strip('/')
|
| 71 |
+
|
| 72 |
# Check if it's a URL
|
| 73 |
if self._is_url(source):
|
| 74 |
if "github.com" in source or (source.count("/") == 1 and "/" in source):
|