Asish Karthikeya Gogineni committed on
Commit
da942b7
·
1 Parent(s): d79ce72

fix: Smart extraction of GitHub URLs from mixed input

Browse files

- Added regex matching to find https://github.com/... inside error messages
- Handles cases where users paste 'Error: Failed to download... <URL>'
- Retains previous sanitization (strip backslashes/.git)

Files changed (1) hide show
  1. code_chatbot/universal_ingestor.py +10 -0
code_chatbot/universal_ingestor.py CHANGED
@@ -54,11 +54,21 @@ class UniversalIngestor(DataManager):
54
  self.local_dir = local_dir or os.path.join(tempfile.gettempdir(), "code_chatbot")
55
  self.delegate = self._detect_handler()
56
 
 
57
  def _detect_handler(self) -> DataManager:
58
  """Detects the type of input and returns the appropriate handler."""
59
  # Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
60
  source = self.source.strip().strip('\\').strip('/')
61
 
 
 
 
 
 
 
 
 
 
62
  # Check if it's a URL
63
  if self._is_url(source):
64
  if "github.com" in source or (source.count("/") == 1 and "/" in source):
 
54
  self.local_dir = local_dir or os.path.join(tempfile.gettempdir(), "code_chatbot")
55
  self.delegate = self._detect_handler()
56
 
57
+
58
  def _detect_handler(self) -> DataManager:
59
  """Detects the type of input and returns the appropriate handler."""
60
  # Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
61
  source = self.source.strip().strip('\\').strip('/')
62
 
63
+ # Smart Extraction: If input looks like garbage (has spaces, long text), try to find a URL inside it
64
+ if "github.com" in source and (" " in source or "\n" in source or "Error" in source):
65
+ import re
66
+ # Regex to find https://github.com/owner/repo
67
+ match = re.search(r'(https?://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_\-\.]+)', source)
68
+ if match:
69
+ logger.info(f"Extracted GitHub URL from text: {match.group(1)}")
70
+ source = match.group(1).strip().strip('\\').strip('/')
71
+
72
  # Check if it's a URL
73
  if self._is_url(source):
74
  if "github.com" in source or (source.count("/") == 1 and "/" in source):