Asish Karthikeya Gogineni committed on
Commit
48661b1
·
1 Parent(s): e5ab135

fix: Robust GitHub URL sanitization

Browse files

- Automatically strips trailing backslashes (\) and slashes (/) from inputs
- Handles common copy-paste errors (e.g. https://.../repo\)
- Removes .git suffix if present

Files changed (1) hide show
  1. code_chatbot/universal_ingestor.py +8 -2
code_chatbot/universal_ingestor.py CHANGED
@@ -56,15 +56,21 @@ class UniversalIngestor(DataManager):
56
 
57
  def _detect_handler(self) -> DataManager:
58
  """Detects the type of input and returns the appropriate handler."""
59
- source = self.source.strip()
 
60
 
61
  # Check if it's a URL
62
  if self._is_url(source):
63
- if "github.com" in source or source.count("/") == 1 and "/" in source:
64
  # GitHub URL or repo ID (owner/repo)
65
  if "github.com" in source:
 
 
 
 
66
  # Extract repo_id from URL
67
  parts = urlparse(source).path.strip("/").split("/")
 
68
  if len(parts) >= 2:
69
  repo_id = f"{parts[0]}/{parts[1]}"
70
  else:
 
56
 
57
  def _detect_handler(self) -> DataManager:
58
  """Detects the type of input and returns the appropriate handler."""
59
+ # Aggressive cleaning: strip whitespace, backslashes, and trailing slashes
60
+ source = self.source.strip().strip('\\').strip('/')
61
 
62
  # Check if it's a URL
63
  if self._is_url(source):
64
+ if "github.com" in source or (source.count("/") == 1 and "/" in source):
65
  # GitHub URL or repo ID (owner/repo)
66
  if "github.com" in source:
67
+ # Remove .git suffix if present
68
+ if source.endswith(".git"):
69
+ source = source[:-4]
70
+
71
  # Extract repo_id from URL
72
  parts = urlparse(source).path.strip("/").split("/")
73
+ # Handle cases like https://github.com/owner/repo/tree/main/...
74
  if len(parts) >= 2:
75
  repo_id = f"{parts[0]}/{parts[1]}"
76
  else: