Spaces:

Nerdur
/

webui

Sleeping

App Files Files Community

webui / github_simple.json

Nerdur

Upload 21 files

02f33b7 verified about 1 month ago

Raw

History Blame Contribute Delete

75.5 kB

[{"id":"github_simple","name":"GitHub Simple","meta":{"description":"Simple full context repo tool.","manifest":{"title":"Advanced GitHub Repository RAG Filter","author":"pkeffect","author_url":"https://github.com/pkeffect","funding_url":"https://github.com/open-webui","version":"2.1","license":"MIT","description":"Precision GitHub repository filter with character-perfect reproduction and detailed metadata","requirements":"requests, sentence-transformers, scikit-learn, numpy"},"type":"filter"},"content":"\"\"\"\ntitle: Advanced GitHub Repository RAG Filter\nauthor: pkeffect\nauthor_url: https://github.com/pkeffect\nfunding_url: https://github.com/open-webui\nversion: 2.1\nlicense: MIT\ndescription: Precision GitHub repository filter with character-perfect reproduction and detailed metadata\nrequirements: requests, sentence-transformers, scikit-learn, numpy\n\"\"\"\n\nimport os\nimport base64\nimport json\nimport time\nimport hashlib\nimport pickle\nimport tempfile\nfrom datetime import datetime\nfrom typing import Optional, Dict, List, Any, Tuple\nfrom pydantic import BaseModel, Field\nimport asyncio\nimport re\n\n# Try to import required libraries with fallbacks\ntry:\n import requests\n\n HAS_REQUESTS = True\nexcept ImportError:\n HAS_REQUESTS = False\n print(\"Warning: requests library not available\")\n\ntry:\n from sentence_transformers import SentenceTransformer\n import numpy as np\n from sklearn.metrics.pairwise import cosine_similarity\n\n HAS_EMBEDDINGS = True\nexcept ImportError:\n HAS_EMBEDDINGS = False\n print(\"Warning: embedding libraries not available - semantic search disabled\")\n\n\nclass Filter:\n class Valves(BaseModel):\n # GitHub Configuration\n github_token: str = Field(\n default=\"\",\n description=\"GitHub Personal Access Token for API authentication\",\n )\n\n github_repo: str = Field(\n default=\"\",\n description=\"GitHub repository in format 'owner/repo' (e.g., 'microsoft/vscode')\",\n )\n\n github_branch: str = Field(\n default=\"main\", description=\"Branch to fetch files from\"\n )\n\n # File Processing\n max_file_size: int = Field(\n default=2097152, description=\"Maximum file size in bytes to include\" # 2MB\n )\n\n chunk_size: int = Field(\n default=1500,\n description=\"Size of text chunks for better context management\",\n )\n\n chunk_overlap: int = Field(\n default=200, description=\"Overlap between chunks to maintain context\"\n )\n\n # File Filtering\n included_extensions: str = Field(\n default=\".py,.js,.ts,.jsx,.tsx,.md,.txt,.json,.yaml,.yml,.toml,.cfg,.ini,.sh,.bash,.sql,.html,.css,.scss,.less,.vue,.svelte,.go,.rs,.java,.cpp,.c,.h,.php,.rb,.swift,.kt,.scala,.clj,.hs,.ml,.fs,.r,.m,.pl,.lua,.dart,.ex,.exs,.xml,.csv,.env,.gitignore,.dockerfile,.makefile,.cmake,.gradle,.pom,.config,.conf,.properties\",\n description=\"Comma-separated list of file extensions to include\",\n )\n\n excluded_extensions: str = Field(\n default=\".png,.jpg,.jpeg,.gif,.ico,.svg,.pdf,.zip,.tar,.gz,.bz2,.xz,.7z,.rar,.exe,.bin,.dll,.so,.dylib,.class,.jar,.war,.ear,.deb,.rpm,.dmg,.msi,.app,.lock,.log,.cache,.tmp,.temp,.backup,.bak,.swp,.swo,.DS_Store,.thumbs.db,.pyc,.pyo,.pyd,.o,.obj,.lib,.a,.la,.lo,.gcda,.gcno\",\n description=\"Comma-separated list of file extensions to exclude\",\n )\n\n excluded_dirs: str = Field(\n default=\"node_modules,.git,.vscode,.idea,dist,build,target,__pycache__,.pytest_cache,.tox,vendor,logs,tmp,temp,.next,coverage,.nyc_output,public/assets,static/assets,.sass-cache,.gradle,bin,obj,.vs,.vscode-test,.dart_tool,packages,.pub-cache,.flutter-plugins,.flutter-plugins-dependencies,Pods,DerivedData,.build,.swiftpm\",\n description=\"Comma-separated list of directories to exclude\",\n )\n\n # RAG Configuration\n enable_semantic_search: bool = Field(\n default=True, description=\"Enable semantic search using embeddings\"\n )\n\n top_k_results: int = Field(\n default=10,\n description=\"Number of most relevant chunks to include in context\",\n )\n\n similarity_threshold: float = Field(\n default=0.05, description=\"Minimum similarity score for chunks (0.0-1.0)\"\n )\n\n # Context Management\n max_context_length: int = Field(\n default=150000,\n description=\"Maximum context length in characters (set high for full reproduction)\",\n )\n\n context_mode: str = Field(\n default=\"smart\",\n description=\"Context injection mode: 'full' (all files), 'smart' (query-based), 'query-only' (only on questions)\",\n )\n\n preserve_exact_formatting: bool = Field(\n default=True,\n description=\"Preserve exact whitespace, tabs, and formatting for character-perfect reproduction\",\n )\n\n # Cache Management\n cache_duration: int = Field(\n default=7200, description=\"Cache duration in seconds\" # 2 hours\n )\n\n persistent_cache: bool = Field(\n default=True, description=\"Enable persistent cache storage\"\n )\n\n # Performance\n auto_load_on_startup: bool = Field(\n default=False, description=\"Automatically load repository on filter startup\"\n )\n\n rate_limit_delay: float = Field(\n default=0.05, description=\"Delay between GitHub API calls in seconds\"\n )\n\n # UI/UX\n show_detailed_file_tree: bool = Field(\n default=True, description=\"Show detailed file tree with complete metadata\"\n )\n\n show_loading_status: bool = Field(\n default=True, description=\"Show detailed loading progress in chat\"\n )\n\n debug_mode: bool = Field(\n default=False, description=\"Enable detailed debug logging\"\n )\n\n # Manual Cache Control\n enable_manual_purge: bool = Field(\n default=True,\n description=\"Enable manual cache purging via 'purge cache' or 'purge context' commands\",\n )\n\n class UserValves(BaseModel):\n enable_github_context: bool = Field(\n default=True, description=\"Enable GitHub repository context injection\"\n )\n\n auto_trigger_phrases: str = Field(\n default=\"analyze code,review repository,explain codebase,show me files,repo analysis,code review,examine code,inspect files,repository overview,codebase analysis\",\n description=\"Comma-separated phrases that auto-trigger repository loading\",\n )\n\n custom_system_prompt: str = Field(\n default=\"\",\n description=\"Custom system prompt to add with repository context\",\n )\n\n preferred_context_mode: str = Field(\n default=\"auto\",\n description=\"User's preferred context mode: 'auto', 'always', 'never', 'on-request'\",\n )\n\n show_file_metadata: bool = Field(\n default=True,\n description=\"Show detailed file metadata (size, lines, characters, etc.)\",\n )\n\n def __init__(self):\n self.valves = self.Valves()\n self.user_valves = self.UserValves()\n\n # Repository cache structure with detailed metadata\n self.repo_cache = {}\n self.embeddings_cache = {}\n self.file_tree_cache = \"\"\n self.detailed_tree_cache = \"\"\n self.cache_timestamp = 0\n self.repo_metadata = {}\n\n # Embedding model (lazy loaded)\n self.embeddings_model = None\n self.embedding_dimension = 384 # all-MiniLM-L6-v2 dimension\n\n # Cache file path\n self.cache_dir = os.path.join(tempfile.gettempdir(), \"openwebui_github_cache\")\n os.makedirs(self.cache_dir, exist_ok=True)\n\n # Load persistent cache if enabled\n if self.valves.persistent_cache:\n self._load_persistent_cache()\n\n def _get_cache_key(self) -> str:\n \"\"\"Generate cache key for current repository configuration\"\"\"\n return hashlib.md5(\n f\"{self.valves.github_repo}#{self.valves.github_branch}#{self.valves.chunk_size}\".encode()\n ).hexdigest()\n\n def _load_persistent_cache(self):\n \"\"\"Load cache from disk if available\"\"\"\n if not self.valves.persistent_cache:\n return\n\n try:\n cache_key = self._get_cache_key() if self.valves.github_repo else None\n if not cache_key:\n return\n\n cache_file = os.path.join(self.cache_dir, f\"{cache_key}.pkl\")\n\n if os.path.exists(cache_file):\n with open(cache_file, \"rb\") as f:\n cached_data = pickle.load(f)\n\n # Check if cache is still valid\n if (\n time.time() - cached_data.get(\"timestamp\", 0)\n < self.valves.cache_duration\n ):\n self.repo_cache = cached_data.get(\"repo_cache\", {})\n self.embeddings_cache = cached_data.get(\"embeddings_cache\", {})\n self.file_tree_cache = cached_data.get(\"file_tree_cache\", \"\")\n self.detailed_tree_cache = cached_data.get(\n \"detailed_tree_cache\", \"\"\n )\n self.repo_metadata = cached_data.get(\"repo_metadata\", {})\n self.cache_timestamp = cached_data.get(\"timestamp\", 0)\n\n if self.valves.debug_mode:\n print(f\"Loaded persistent cache: {len(self.repo_cache)} files\")\n\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"Error loading persistent cache: {e}\")\n\n def _save_persistent_cache(self):\n \"\"\"Save cache to disk\"\"\"\n if not self.valves.persistent_cache:\n return\n\n try:\n cache_key = self._get_cache_key()\n cache_file = os.path.join(self.cache_dir, f\"{cache_key}.pkl\")\n\n cached_data = {\n \"repo_cache\": self.repo_cache,\n \"embeddings_cache\": self.embeddings_cache,\n \"file_tree_cache\": self.file_tree_cache,\n \"detailed_tree_cache\": self.detailed_tree_cache,\n \"repo_metadata\": self.repo_metadata,\n \"timestamp\": self.cache_timestamp,\n }\n\n with open(cache_file, \"wb\") as f:\n pickle.dump(cached_data, f)\n\n if self.valves.debug_mode:\n print(f\"Saved persistent cache: {len(self.repo_cache)} files\")\n\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"Error saving persistent cache: {e}\")\n\n def _get_embeddings_model(self):\n \"\"\"Lazy load embeddings model\"\"\"\n if not HAS_EMBEDDINGS or not self.valves.enable_semantic_search:\n return None\n\n if self.embeddings_model is None:\n try:\n self.embeddings_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n if self.valves.debug_mode:\n print(\"Loaded embeddings model: all-MiniLM-L6-v2\")\n except Exception as e:\n print(f\"Error loading embeddings model: {e}\")\n return None\n\n return self.embeddings_model\n\n def _get_file_extensions(self, extension_string: str) -> set:\n \"\"\"Parse comma-separated extension string into set\"\"\"\n return {\n ext.strip().lower() for ext in extension_string.split(\",\") if ext.strip()\n }\n\n def _get_excluded_dirs(self) -> set:\n \"\"\"Get set of excluded directory names\"\"\"\n return {\n dir.strip().lower()\n for dir in self.valves.excluded_dirs.split(\",\")\n if dir.strip()\n }\n\n def _should_include_file(self, file_path: str, file_size: int) -> bool:\n \"\"\"Advanced file filtering logic\"\"\"\n # Check file size\n if file_size > self.valves.max_file_size:\n return False\n\n # Get file extension\n file_ext = os.path.splitext(file_path)[1].lower()\n filename_lower = os.path.basename(file_path).lower()\n\n # Special handling for extensionless files\n extensionless_files = {\n \"dockerfile\",\n \"makefile\",\n \"rakefile\",\n \"gemfile\",\n \"procfile\",\n \"vagrantfile\",\n \"jenkinsfile\",\n \"gulpfile\",\n \"gruntfile\",\n }\n\n # Check if explicitly excluded\n excluded_exts = self._get_file_extensions(self.valves.excluded_extensions)\n if file_ext in excluded_exts:\n return False\n\n # Check if explicitly included\n included_exts = self._get_file_extensions(self.valves.included_extensions)\n if included_exts:\n # Include if extension matches OR if it's a special extensionless file\n if (\n file_ext not in included_exts\n and filename_lower not in extensionless_files\n ):\n return False\n\n # Check directory exclusions\n path_parts = [part.lower() for part in file_path.split(\"/\")[:-1]]\n excluded_dirs = self._get_excluded_dirs()\n\n for part in path_parts:\n if part in excluded_dirs:\n return False\n\n # Additional smart filtering for common unwanted files\n skip_patterns = [\n \"package-lock.json\",\n \"yarn.lock\",\n \"composer.lock\",\n \"gemfile.lock\",\n \"pipfile.lock\",\n \"poetry.lock\",\n \".eslintcache\",\n \".stylelintcache\",\n \"npm-debug.log\",\n \"yarn-debug.log\",\n \"yarn-error.log\",\n \".env.local\",\n \".env.development.local\",\n \".env.test.local\",\n \".env.production.local\",\n ]\n\n for pattern in skip_patterns:\n if pattern in filename_lower:\n return False\n\n return True\n\n def _get_github_headers(self) -> dict:\n \"\"\"Get headers for GitHub API requests\"\"\"\n headers = {\n \"Accept\": \"application/vnd.github.v3+json\",\n \"User-Agent\": \"OpenWebUI-GitHub-RAG-Filter/2.1\",\n }\n\n if self.valves.github_token:\n headers[\"Authorization\"] = f\"token {self.valves.github_token}\"\n\n return headers\n\n def _analyze_file_content(self, content: str, file_path: str) -> Dict:\n \"\"\"Analyze file content for detailed metadata\"\"\"\n lines = content.split(\"\\n\")\n\n # Character analysis\n char_count = len(content)\n char_count_no_whitespace = len(re.sub(r\"\\s\", \"\", content))\n line_count = len(lines)\n non_empty_lines = len([line for line in lines if line.strip()])\n\n # Line length analysis\n line_lengths = [len(line) for line in lines]\n max_line_length = max(line_lengths) if line_lengths else 0\n avg_line_length = sum(line_lengths) / len(line_lengths) if line_lengths else 0\n\n # Indentation analysis\n indented_lines = len([line for line in lines if line.startswith((\" \", \"\\t\"))])\n tab_lines = len([line for line in lines if \"\\t\" in line])\n space_lines = len([line for line in lines if line.startswith(\" \")])\n\n # Content type detection\n file_ext = os.path.splitext(file_path)[1].lower()\n\n # Language-specific analysis\n analysis = {\n \"char_count\": char_count,\n \"char_count_no_whitespace\": char_count_no_whitespace,\n \"line_count\": line_count,\n \"non_empty_lines\": non_empty_lines,\n \"empty_lines\": line_count - non_empty_lines,\n \"max_line_length\": max_line_length,\n \"avg_line_length\": round(avg_line_length, 1),\n \"indented_lines\": indented_lines,\n \"tab_lines\": tab_lines,\n \"space_lines\": space_lines,\n \"whitespace_ratio\": (\n round((char_count - char_count_no_whitespace) / char_count * 100, 1)\n if char_count > 0\n else 0\n ),\n \"file_extension\": file_ext,\n \"estimated_encoding\": \"utf-8\", # GitHub API provides UTF-8\n }\n\n # Language-specific metrics\n if file_ext in [\".py\", \".pyx\", \".pyi\"]:\n analysis[\"language\"] = \"Python\"\n analysis[\"import_lines\"] = len(\n [\n line\n for line in lines\n if line.strip().startswith((\"import \", \"from \"))\n ]\n )\n analysis[\"comment_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"#\")]\n )\n analysis[\"docstring_lines\"] = len(\n [line for line in lines if '\"\"\"' in line or \"'''\" in line]\n )\n elif file_ext in [\".js\", \".jsx\", \".ts\", \".tsx\"]:\n analysis[\"language\"] = \"JavaScript/TypeScript\"\n analysis[\"import_lines\"] = len(\n [line for line in lines if \"import \" in line or \"require(\" in line]\n )\n analysis[\"comment_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"//\")]\n )\n analysis[\"function_lines\"] = len(\n [line for line in lines if \"function \" in line or \"=>\" in line]\n )\n elif file_ext in [\".java\", \".scala\", \".kt\"]:\n analysis[\"language\"] = \"JVM Language\"\n analysis[\"import_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"import \")]\n )\n analysis[\"comment_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"//\")]\n )\n analysis[\"class_lines\"] = len([line for line in lines if \"class \" in line])\n elif file_ext in [\".go\"]:\n analysis[\"language\"] = \"Go\"\n analysis[\"import_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"import \")]\n )\n analysis[\"comment_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"//\")]\n )\n analysis[\"func_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"func \")]\n )\n elif file_ext in [\".rs\"]:\n analysis[\"language\"] = \"Rust\"\n analysis[\"use_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"use \")]\n )\n analysis[\"comment_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"//\")]\n )\n analysis[\"fn_lines\"] = len([line for line in lines if \"fn \" in line])\n elif file_ext in [\".md\"]:\n analysis[\"language\"] = \"Markdown\"\n analysis[\"header_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"#\")]\n )\n analysis[\"code_block_lines\"] = len(\n [line for line in lines if line.strip().startswith(\"```\")]\n )\n analysis[\"link_lines\"] = len(\n [line for line in lines if \"[\" in line and \"](\" in line]\n )\n\n return analysis\n\n def _chunk_text(self, text: str, file_path: str) -> List[Dict]:\n \"\"\"Advanced text chunking with overlap and precise metadata\"\"\"\n chunks = []\n lines = text.split(\"\\n\")\n\n current_chunk = []\n current_size = 0\n start_line = 1\n\n for i, line in enumerate(lines, 1):\n current_chunk.append(line)\n current_size += len(line) + 1 # +1 for newline\n\n # Create chunk when size limit reached\n if current_size >= self.valves.chunk_size or i == len(lines):\n if current_chunk: # Only add non-empty chunks\n chunk_text = \"\\n\".join(current_chunk)\n\n chunks.append(\n {\n \"file_path\": file_path,\n \"content\": chunk_text,\n \"start_line\": start_line,\n \"end_line\": i,\n \"size\": len(chunk_text),\n \"line_count\": len(current_chunk),\n \"char_count\": len(chunk_text),\n \"id\": f\"{file_path}:{start_line}-{i}\",\n \"chunk_index\": len(chunks),\n }\n )\n\n # Handle overlap for next chunk\n if i < len(lines) and self.valves.chunk_overlap > 0:\n overlap_lines = []\n overlap_size = 0\n\n # Take last few lines for overlap\n for j in range(len(current_chunk) - 1, -1, -1):\n line = current_chunk[j]\n if overlap_size + len(line) <= self.valves.chunk_overlap:\n overlap_lines.insert(0, line)\n overlap_size += len(line) + 1\n else:\n break\n\n current_chunk = overlap_lines\n current_size = overlap_size\n start_line = i - len(overlap_lines) + 1\n else:\n current_chunk = []\n current_size = 0\n start_line = i + 1\n\n return chunks\n\n async def _get_repository_tree(self, __event_emitter__=None) -> Dict[str, Any]:\n \"\"\"Get repository tree with progress updates\"\"\"\n if not HAS_REQUESTS or not self.valves.github_repo:\n return {}\n\n try:\n if __event_emitter__ and self.valves.show_loading_status:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"📡 Fetching repository tree: {self.valves.github_repo}\",\n \"done\": False,\n },\n }\n )\n\n tree_url = f\"https://api.github.com/repos/{self.valves.github_repo}/git/trees/{self.valves.github_branch}?recursive=1\"\n\n response = requests.get(\n tree_url, headers=self._get_github_headers(), timeout=30\n )\n response.raise_for_status()\n\n tree_data = response.json()\n\n if __event_emitter__ and self.valves.show_loading_status:\n total_items = len(tree_data.get(\"tree\", []))\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"📊 Found {total_items} items in repository tree\",\n \"done\": False,\n },\n }\n )\n\n return tree_data\n\n except Exception as e:\n error_msg = f\"❌ Error fetching repository tree: {e}\"\n print(error_msg)\n\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": error_msg, \"done\": True}}\n )\n\n return {}\n\n async def _get_file_content(self, file_path: str) -> Optional[str]:\n \"\"\"Get file content from GitHub API with rate limiting and precise handling\"\"\"\n if not HAS_REQUESTS:\n return None\n\n try:\n # Rate limiting\n if self.valves.rate_limit_delay > 0:\n await asyncio.sleep(self.valves.rate_limit_delay)\n\n content_url = f\"https://api.github.com/repos/{self.valves.github_repo}/contents/{file_path}?ref={self.valves.github_branch}\"\n\n response = requests.get(\n content_url, headers=self._get_github_headers(), timeout=30\n )\n response.raise_for_status()\n\n file_data = response.json()\n\n if file_data.get(\"encoding\") == \"base64\":\n # Decode with precise handling to preserve all characters\n content_bytes = base64.b64decode(file_data[\"content\"])\n\n # Try UTF-8 first, then fall back to other encodings\n try:\n content = content_bytes.decode(\"utf-8\")\n except UnicodeDecodeError:\n try:\n content = content_bytes.decode(\"latin-1\")\n except UnicodeDecodeError:\n content = content_bytes.decode(\"utf-8\", errors=\"replace\")\n\n # Preserve exact formatting if enabled\n if self.valves.preserve_exact_formatting:\n # Don't strip or modify whitespace\n return content\n else:\n return content\n\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"❌ Error fetching file {file_path}: {e}\")\n\n return None\n\n def _build_detailed_directory_tree(self, tree_data: Dict) -> str:\n \"\"\"Build extremely detailed directory tree with full metadata\"\"\"\n if not tree_data.get(\"tree\"):\n return \"\"\n\n # Organize by directory structure\n dirs = {}\n files = {}\n\n # Separate directories and files\n for item in tree_data[\"tree\"]:\n if item[\"type\"] == \"tree\":\n dirs[item[\"path\"]] = item\n elif item[\"type\"] == \"blob\":\n dir_path = os.path.dirname(item[\"path\"])\n if dir_path not in files:\n files[dir_path] = []\n files[dir_path].append(item)\n\n tree_lines = []\n tree_lines.append(\"📁 DETAILED REPOSITORY DIRECTORY STRUCTURE\")\n tree_lines.append(\"═\" * 80)\n tree_lines.append(f\"Repository: {self.valves.github_repo}\")\n tree_lines.append(f\"Branch: {self.valves.github_branch}\")\n tree_lines.append(f\"Total Directories: {len(dirs)}\")\n tree_lines.append(\n f\"Total Files: {sum(len(file_list) for file_list in files.values())}\"\n )\n tree_lines.append(\"\")\n\n # Build hierarchical tree\n all_paths = sorted(set(list(dirs.keys()) + list(files.keys())))\n\n for path in all_paths:\n if path == \"\": # Root directory\n tree_lines.append(\"📂 ROOT/\")\n if \"\" in files:\n for file_item in sorted(files[\"\"], key=lambda x: x[\"path\"]):\n file_path = file_item[\"path\"]\n file_size = file_item.get(\"size\", 0)\n file_ext = os.path.splitext(file_path)[1] or \"no-ext\"\n\n # Add file metadata from cache if available\n metadata_str = f\"({file_size:,} bytes, {file_ext})\"\n if file_path in self.repo_cache:\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n if analysis:\n line_count = analysis.get(\"line_count\", 0)\n char_count = analysis.get(\"char_count\", 0)\n metadata_str = f\"({file_size:,} bytes, {line_count:,} lines, {char_count:,} chars, {file_ext})\"\n\n tree_lines.append(f\" ├── 📄 {file_path} {metadata_str}\")\n\n # Show included/excluded status\n if self._should_include_file(file_path, file_size):\n tree_lines.append(f\" ✅ INCLUDED in context\")\n else:\n tree_lines.append(f\" ❌ EXCLUDED from context\")\n else:\n # Directory\n depth = len(path.split(\"/\"))\n indent = \" \" * depth\n tree_lines.append(f\"{indent}📂 {os.path.basename(path)}/\")\n\n # Add files in this directory\n if path in files:\n for file_item in sorted(files[path], key=lambda x: x[\"path\"]):\n file_path = file_item[\"path\"]\n file_name = os.path.basename(file_path)\n file_size = file_item.get(\"size\", 0)\n file_ext = os.path.splitext(file_path)[1] or \"no-ext\"\n\n # Add detailed metadata from cache if available\n metadata_str = f\"({file_size:,} bytes, {file_ext})\"\n if file_path in self.repo_cache:\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n if analysis:\n line_count = analysis.get(\"line_count\", 0)\n char_count = analysis.get(\"char_count\", 0)\n chunks = analysis.get(\"chunks\", 0)\n language = analysis.get(\"language\", \"Unknown\")\n metadata_str = f\"({file_size:,} bytes, {line_count:,} lines, {char_count:,} chars, {chunks} chunks, {language}, {file_ext})\"\n\n tree_lines.append(\n f\"{indent} ├── 📄 {file_name} {metadata_str}\"\n )\n\n # Show SHA and inclusion status\n sha = file_item.get(\"sha\", \"\")[:8]\n if sha:\n tree_lines.append(f\"{indent} 🔗 SHA: {sha}\")\n\n if self._should_include_file(file_path, file_size):\n tree_lines.append(f\"{indent} ✅ INCLUDED in context\")\n else:\n reason = (\n \"size\"\n if file_size > self.valves.max_file_size\n else \"filtered\"\n )\n tree_lines.append(\n f\"{indent} ❌ EXCLUDED from context ({reason})\"\n )\n\n return \"\\n\".join(tree_lines)\n\n def _generate_file_summary_table(self) -> str:\n \"\"\"Generate detailed file summary table\"\"\"\n if not self.repo_cache:\n return \"\"\n\n lines = []\n lines.append(\"📊 FILE ANALYSIS SUMMARY TABLE\")\n lines.append(\"═\" * 120)\n lines.append(\n \"| FILE PATH | SIZE | LINES | CHARS | CHUNKS | LANGUAGE | EXTENSION | ENCODING |\"\n )\n lines.append(\n \"|-----------|------|-------|-------|--------|----------|-----------|----------|\"\n )\n\n for file_path in sorted(self.repo_cache.keys()):\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n\n size_str = f\"{file_data['size']:,} bytes\"\n lines_str = f\"{analysis.get('line_count', 0):,}\"\n chars_str = f\"{analysis.get('char_count', 0):,}\"\n chunks_str = f\"{file_data.get('chunks', 0)}\"\n lang_str = analysis.get(\"language\", \"Unknown\")[:10]\n ext_str = analysis.get(\"file_extension\", \"none\")\n enc_str = analysis.get(\"estimated_encoding\", \"utf-8\")\n\n # Truncate file path if too long\n display_path = (\n file_path if len(file_path) <= 40 else f\"...{file_path[-37:]}\"\n )\n\n lines.append(\n f\"| {display_path:<40} | {size_str:<8} | {lines_str:<5} | {chars_str:<8} | {chunks_str:<6} | {lang_str:<8} | {ext_str:<9} | {enc_str:<8} |\"\n )\n\n lines.append(\"\")\n lines.append(\n f\"TOTALS: {len(self.repo_cache)} files, {sum(f['size'] for f in self.repo_cache.values()):,} bytes, {sum(f.get('analysis', {}).get('line_count', 0) for f in self.repo_cache.values()):,} lines\"\n )\n\n return \"\\n\".join(lines)\n\n async def _generate_embeddings(self, chunks: List[Dict], __event_emitter__=None):\n \"\"\"Generate embeddings for chunks with precise progress\"\"\"\n if not self.valves.enable_semantic_search or not HAS_EMBEDDINGS:\n return\n\n model = self._get_embeddings_model()\n if not model:\n return\n\n try:\n if __event_emitter__ and self.valves.show_loading_status:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"🧠 Generating embeddings for {len(chunks):,} chunks\",\n \"done\": False,\n },\n }\n )\n\n # Prepare texts for embedding\n chunk_texts = []\n for chunk in chunks:\n # Create rich context for embedding\n context_text = f\"File: {chunk['file_path']}\\nLines: {chunk['start_line']}-{chunk['end_line']}\\n\\n{chunk['content']}\"\n chunk_texts.append(context_text)\n\n # Generate embeddings in batches\n batch_size = 16 # Smaller batches for better progress reporting\n all_embeddings = []\n\n for i in range(0, len(chunk_texts), batch_size):\n batch = chunk_texts[i : i + batch_size]\n batch_embeddings = model.encode(batch, show_progress_bar=False)\n all_embeddings.extend(batch_embeddings)\n\n # Progress update\n if __event_emitter__ and self.valves.show_loading_status:\n progress = min(100, int((i + len(batch)) / len(chunk_texts) * 100))\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"🧠 Generating embeddings: {progress}% ({i + len(batch):,}/{len(chunk_texts):,})\",\n \"done\": False,\n },\n }\n )\n\n # Store embeddings with metadata\n for i, chunk in enumerate(chunks):\n self.embeddings_cache[chunk[\"id\"]] = {\n \"embedding\": all_embeddings[i].tolist(),\n \"file_path\": chunk[\"file_path\"],\n \"start_line\": chunk[\"start_line\"],\n \"end_line\": chunk[\"end_line\"],\n \"size\": chunk[\"size\"],\n \"generated_at\": datetime.now().isoformat(),\n }\n\n if self.valves.debug_mode:\n print(f\"✅ Generated embeddings for {len(chunks):,} chunks\")\n\n except Exception as e:\n print(f\"❌ Error generating embeddings: {e}\")\n\n async def load_repository(self, __event_emitter__=None, force_reload=False) -> bool:\n \"\"\"Load repository with comprehensive progress updates and detailed metadata\"\"\"\n if not self.valves.github_repo:\n return False\n\n # Check if we need to reload\n if not force_reload and self._is_cache_valid():\n if self.valves.debug_mode:\n print(\"✅ Using cached repository data\")\n return True\n\n try:\n start_time = time.time()\n\n if __event_emitter__ and self.valves.show_loading_status:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"🚀 Loading repository: {self.valves.github_repo}\",\n \"done\": False,\n },\n }\n )\n\n # Get repository tree\n tree_data = await self._get_repository_tree(__event_emitter__)\n if not tree_data.get(\"tree\"):\n return False\n\n # Build detailed tree first\n self.detailed_tree_cache = self._build_detailed_directory_tree(tree_data)\n\n # Process files with detailed progress\n total_files = len(\n [item for item in tree_data[\"tree\"] if item[\"type\"] == \"blob\"]\n )\n files_processed = 0\n files_included = 0\n files_excluded = 0\n total_bytes = 0\n total_lines = 0\n total_chars = 0\n all_chunks = []\n\n # Clear existing cache\n self.repo_cache = {}\n self.embeddings_cache = {}\n\n if __event_emitter__ and self.valves.show_loading_status:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"📋 Processing {total_files:,} files\",\n \"done\": False,\n },\n }\n )\n\n # Process files\n for item in tree_data[\"tree\"]:\n if item[\"type\"] == \"blob\":\n files_processed += 1\n file_path = item[\"path\"]\n file_size = item.get(\"size\", 0)\n\n # Progress update every 10 files\n if (\n __event_emitter__\n and self.valves.show_loading_status\n and files_processed % 10 == 0\n ):\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"📄 Processing files: {files_processed:,}/{total_files:,} ({files_included:,} included, {files_excluded:,} excluded)\",\n \"done\": False,\n },\n }\n )\n\n # Check if file should be included\n if not self._should_include_file(file_path, file_size):\n files_excluded += 1\n continue\n\n # Get file content\n content = await self._get_file_content(file_path)\n if not content:\n files_excluded += 1\n continue\n\n # Analyze file content in detail\n analysis = self._analyze_file_content(content, file_path)\n\n # Create chunks\n chunks = self._chunk_text(content, file_path)\n all_chunks.extend(chunks)\n\n # Store file data with comprehensive metadata\n self.repo_cache[file_path] = {\n \"content\": content, # Exact character-perfect content\n \"size\": file_size,\n \"chunks\": len(chunks),\n \"sha\": item.get(\"sha\", \"\"),\n \"last_updated\": datetime.now().isoformat(),\n \"analysis\": analysis,\n \"github_url\": f\"https://github.com/{self.valves.github_repo}/blob/{self.valves.github_branch}/{file_path}\",\n \"raw_url\": f\"https://raw.githubusercontent.com/{self.valves.github_repo}/{self.valves.github_branch}/{file_path}\",\n }\n\n files_included += 1\n total_bytes += file_size\n total_lines += analysis.get(\"line_count\", 0)\n total_chars += analysis.get(\"char_count\", 0)\n\n # Generate embeddings with progress\n if all_chunks and self.valves.enable_semantic_search:\n await self._generate_embeddings(all_chunks, __event_emitter__)\n\n # Build simple file tree for backward compatibility\n file_tree_lines = [\n f\"Repository: {self.valves.github_repo} (branch: {self.valves.github_branch})\"\n ]\n file_tree_lines.append(\"═\" * 80)\n\n for file_path in sorted(self.repo_cache.keys()):\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n line_count = analysis.get(\"line_count\", 0)\n char_count = analysis.get(\"char_count\", 0)\n chunks = file_data.get(\"chunks\", 0)\n file_tree_lines.append(\n f\"📄 {file_path} ({file_data['size']:,} bytes, {line_count:,} lines, {char_count:,} chars, {chunks} chunks)\"\n )\n\n self.file_tree_cache = \"\\n\".join(file_tree_lines)\n\n # Store comprehensive metadata\n load_time = time.time() - start_time\n self.repo_metadata = {\n \"repo_url\": self.valves.github_repo,\n \"branch\": self.valves.github_branch,\n \"total_files_processed\": files_processed,\n \"total_files_included\": files_included,\n \"total_files_excluded\": files_excluded,\n \"total_chunks\": len(all_chunks),\n \"total_bytes\": total_bytes,\n \"total_lines\": total_lines,\n \"total_characters\": total_chars,\n \"load_time_seconds\": round(load_time, 2),\n \"files_per_second\": (\n round(files_processed / load_time, 1) if load_time > 0 else 0\n ),\n \"bytes_per_second\": (\n round(total_bytes / load_time, 0) if load_time > 0 else 0\n ),\n \"last_updated\": datetime.now().isoformat(),\n \"embeddings_enabled\": self.valves.enable_semantic_search\n and HAS_EMBEDDINGS,\n \"total_embeddings\": len(self.embeddings_cache),\n \"cache_key\": self._get_cache_key(),\n }\n\n self.cache_timestamp = time.time()\n\n # Save persistent cache\n self._save_persistent_cache()\n\n # Final comprehensive status update\n if __event_emitter__ and self.valves.show_loading_status:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"✅ Repository loaded: {files_included:,} files ({total_bytes:,} bytes, {total_lines:,} lines, {len(all_chunks):,} chunks) in {load_time:.1f}s\",\n \"done\": True,\n },\n }\n )\n\n print(\n f\"✅ Repository loaded successfully: {files_included:,} files, {len(all_chunks):,} chunks, {total_bytes:,} bytes\"\n )\n return True\n\n except Exception as e:\n error_msg = f\"❌ Error loading repository: {e}\"\n print(error_msg)\n\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": error_msg, \"done\": True}}\n )\n\n return False\n\n def _is_cache_valid(self) -> bool:\n \"\"\"Check if cache is still valid\"\"\"\n if not self.repo_cache:\n return False\n return (time.time() - self.cache_timestamp) < self.valves.cache_duration\n\n def _semantic_search(self, query: str) -> List[Dict]:\n \"\"\"Perform semantic search on repository chunks with detailed results\"\"\"\n if (\n not self.valves.enable_semantic_search\n or not HAS_EMBEDDINGS\n or not self.embeddings_cache\n ):\n return []\n\n model = self._get_embeddings_model()\n if not model:\n return []\n\n try:\n # Generate query embedding\n query_embedding = model.encode([query])\n\n # Get all chunk embeddings\n chunk_ids = list(self.embeddings_cache.keys())\n chunk_embeddings = np.array(\n [self.embeddings_cache[chunk_id][\"embedding\"] for chunk_id in chunk_ids]\n )\n\n # Calculate similarities\n similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]\n\n # Get top-k results above threshold\n results = []\n for i, similarity in enumerate(similarities):\n if similarity >= self.valves.similarity_threshold:\n chunk_id = chunk_ids[i]\n chunk_data = self.embeddings_cache[chunk_id]\n\n results.append(\n {\n \"chunk_id\": chunk_id,\n \"file_path\": chunk_data[\"file_path\"],\n \"start_line\": chunk_data[\"start_line\"],\n \"end_line\": chunk_data[\"end_line\"],\n \"similarity\": float(similarity),\n \"content\": self._get_chunk_content(chunk_id),\n \"size\": chunk_data[\"size\"],\n \"line_count\": chunk_data[\"end_line\"]\n - chunk_data[\"start_line\"]\n + 1,\n }\n )\n\n # Sort by similarity and return top-k\n results.sort(key=lambda x: x[\"similarity\"], reverse=True)\n return results[: self.valves.top_k_results]\n\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"❌ Error in semantic search: {e}\")\n return []\n\n def _get_chunk_content(self, chunk_id: str) -> str:\n \"\"\"Get content for a specific chunk with precise line extraction\"\"\"\n try:\n file_path, line_range = chunk_id.split(\":\")\n start_line, end_line = map(int, line_range.split(\"-\"))\n\n if file_path in self.repo_cache:\n content = self.repo_cache[file_path][\"content\"]\n lines = content.split(\"\\n\")\n chunk_lines = lines[start_line - 1 : end_line]\n return \"\\n\".join(chunk_lines)\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"❌ Error getting chunk content for {chunk_id}: {e}\")\n return \"\"\n\n def _build_context_from_search(self, query: str, user_valves) -> str:\n \"\"\"Build context using semantic search results with comprehensive metadata\"\"\"\n if not self.repo_cache:\n return \"\"\n\n context_parts = []\n show_metadata = getattr(user_valves, \"show_file_metadata\", True)\n\n # Header with query\n context_parts.append(\"🔍 REPOSITORY CONTEXT (Query-Based Semantic Search)\")\n context_parts.append(\"═\" * 100)\n context_parts.append(f\"Repository: {self.valves.github_repo}\")\n context_parts.append(f\"Branch: {self.valves.github_branch}\")\n context_parts.append(f'Search Query: \"{query}\"')\n context_parts.append(\"\")\n\n # Repository statistics\n if self.repo_metadata:\n context_parts.append(\"📊 REPOSITORY STATISTICS:\")\n context_parts.append(\n f\"• Total Files: {self.repo_metadata.get('total_files_included', 0):,}\"\n )\n context_parts.append(\n f\"• Total Size: {self.repo_metadata.get('total_bytes', 0):,} bytes\"\n )\n context_parts.append(\n f\"• Total Lines: {self.repo_metadata.get('total_lines', 0):,}\"\n )\n context_parts.append(\n f\"• Total Characters: {self.repo_metadata.get('total_characters', 0):,}\"\n )\n context_parts.append(\n f\"• Total Chunks: {self.repo_metadata.get('total_chunks', 0):,}\"\n )\n context_parts.append(\n f\"• Load Time: {self.repo_metadata.get('load_time_seconds', 0)} seconds\"\n )\n context_parts.append(\"\")\n\n # Semantic search results\n if self.valves.enable_semantic_search and HAS_EMBEDDINGS:\n search_results = self._semantic_search(query)\n\n if search_results:\n context_parts.append(\n f\"🎯 MOST RELEVANT CODE SECTIONS (Top {len(search_results)} of {len(self.embeddings_cache)} chunks):\"\n )\n context_parts.append(\"─\" * 80)\n\n for i, result in enumerate(search_results, 1):\n file_path = result[\"file_path\"]\n similarity = result[\"similarity\"]\n start_line = result[\"start_line\"]\n end_line = result[\"end_line\"]\n line_count = result[\"line_count\"]\n size = result[\"size\"]\n\n # File metadata header\n context_parts.append(f\"\\n[{i}] 📄 FILE: {file_path}\")\n context_parts.append(\n f\" 📊 Lines: {start_line:,}-{end_line:,} ({line_count:,} lines)\"\n )\n context_parts.append(f\" 📏 Size: {size:,} characters\")\n context_parts.append(f\" 🎯 Relevance: {similarity:.4f}\")\n\n # Add file-level metadata if available\n if show_metadata and file_path in self.repo_cache:\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n if analysis:\n context_parts.append(\n f\" 🔤 Total File Lines: {analysis.get('line_count', 0):,}\"\n )\n context_parts.append(\n f\" 📝 Language: {analysis.get('language', 'Unknown')}\"\n )\n context_parts.append(\n f\" 📎 Extension: {analysis.get('file_extension', 'none')}\"\n )\n context_parts.append(\n f\" 🔗 GitHub URL: {file_data.get('github_url', '')}\"\n )\n\n context_parts.append(f\" {'─' * 60}\")\n context_parts.append(\"```\")\n context_parts.append(result[\"content\"])\n context_parts.append(\"```\")\n context_parts.append(\"\")\n\n context_parts.append(\n f\"[Semantic search found {len(search_results)} relevant sections with similarity ≥ {self.valves.similarity_threshold}]\"\n )\n else:\n context_parts.append(\n f'❌ No highly relevant sections found for query: \"{query}\"'\n )\n context_parts.append(\n f\" (Searched {len(self.embeddings_cache):,} chunks with threshold ≥ {self.valves.similarity_threshold})\"\n )\n\n # Fallback to file listing\n context_parts.append(\"\\n📁 AVAILABLE FILES FOR REFERENCE:\")\n for i, file_path in enumerate(sorted(self.repo_cache.keys())[:15], 1):\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n size = file_data[\"size\"]\n lines = analysis.get(\"line_count\", 0)\n chars = analysis.get(\"char_count\", 0)\n context_parts.append(\n f\" [{i:2d}] 📄 {file_path} ({size:,} bytes, {lines:,} lines, {chars:,} chars)\"\n )\n\n if len(self.repo_cache) > 15:\n context_parts.append(\n f\" ... and {len(self.repo_cache) - 15:,} more files\"\n )\n else:\n # Fallback without semantic search\n context_parts.append(\"📁 REPOSITORY FILES (Semantic search disabled):\")\n context_parts.append(\"─\" * 80)\n for i, file_path in enumerate(sorted(self.repo_cache.keys())[:20], 1):\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n size = file_data[\"size\"]\n lines = analysis.get(\"line_count\", 0)\n chars = analysis.get(\"char_count\", 0)\n lang = analysis.get(\"language\", \"Unknown\")\n\n context_parts.append(f\"[{i:2d}] 📄 {file_path}\")\n if show_metadata:\n context_parts.append(\n f\" 📊 {size:,} bytes, {lines:,} lines, {chars:,} characters\"\n )\n context_parts.append(f\" 📝 Language: {lang}\")\n context_parts.append(f\" 🔗 {file_data.get('github_url', '')}\")\n\n if len(self.repo_cache) > 20:\n context_parts.append(\n f\"\\n... and {len(self.repo_cache) - 20:,} more files available\"\n )\n\n # Add detailed directory structure if requested\n if self.valves.show_detailed_file_tree and self.detailed_tree_cache:\n context_parts.append(f\"\\n{self.detailed_tree_cache}\")\n\n full_context = \"\\n\".join(context_parts)\n\n # Truncate if too long, but preserve structure\n if len(full_context) > self.valves.max_context_length:\n full_context = (\n full_context[: self.valves.max_context_length]\n + \"\\n\\n[CONTEXT TRUNCATED - USE FULL MODE FOR COMPLETE CONTENT]\"\n )\n\n return full_context\n\n def _build_full_context(self, user_valves) -> str:\n \"\"\"Build complete repository context with character-perfect reproduction\"\"\"\n if not self.repo_cache:\n return \"\"\n\n context_parts = []\n show_metadata = getattr(user_valves, \"show_file_metadata\", True)\n\n # Comprehensive header\n context_parts.append(\"🗂️ COMPLETE REPOSITORY CONTEXT (Full Mode)\")\n context_parts.append(\"═\" * 100)\n\n # Repository metadata\n if self.repo_metadata:\n context_parts.append(\"📊 REPOSITORY STATISTICS:\")\n context_parts.append(f\"Repository: {self.repo_metadata['repo_url']}\")\n context_parts.append(f\"Branch: {self.repo_metadata['branch']}\")\n context_parts.append(\n f\"Files Processed: {self.repo_metadata.get('total_files_processed', 0):,}\"\n )\n context_parts.append(\n f\"Files Included: {self.repo_metadata.get('total_files_included', 0):,}\"\n )\n context_parts.append(\n f\"Files Excluded: {self.repo_metadata.get('total_files_excluded', 0):,}\"\n )\n context_parts.append(\n f\"Total Size: {self.repo_metadata.get('total_bytes', 0):,} bytes\"\n )\n context_parts.append(\n f\"Total Lines: {self.repo_metadata.get('total_lines', 0):,}\"\n )\n context_parts.append(\n f\"Total Characters: {self.repo_metadata.get('total_characters', 0):,}\"\n )\n context_parts.append(\n f\"Total Chunks: {self.repo_metadata.get('total_chunks', 0):,}\"\n )\n context_parts.append(\n f\"Load Time: {self.repo_metadata.get('load_time_seconds', 0)} seconds\"\n )\n context_parts.append(\n f\"Processing Speed: {self.repo_metadata.get('files_per_second', 0)} files/sec\"\n )\n context_parts.append(\n f\"Last Updated: {self.repo_metadata.get('last_updated', 'Unknown')}\"\n )\n context_parts.append(\"\")\n\n # Detailed file summary table\n if show_metadata:\n context_parts.append(self._generate_file_summary_table())\n context_parts.append(\"\")\n\n # Detailed directory structure\n if self.valves.show_detailed_file_tree and self.detailed_tree_cache:\n context_parts.append(self.detailed_tree_cache)\n context_parts.append(\"\")\n\n # Complete file contents with precise metadata\n context_parts.append(\n \"📋 COMPLETE FILE CONTENTS (Character-Perfect Reproduction):\"\n )\n context_parts.append(\"═\" * 100)\n\n for file_path in sorted(self.repo_cache.keys()):\n file_data = self.repo_cache[file_path]\n analysis = file_data.get(\"analysis\", {})\n\n # File header with comprehensive metadata\n context_parts.append(f\"\\n{'█' * 80}\")\n context_parts.append(f\"📄 FILE: {file_path}\")\n context_parts.append(f\"{'█' * 80}\")\n\n if show_metadata:\n context_parts.append(\n f\"🔗 GitHub URL: {file_data.get('github_url', '')}\"\n )\n context_parts.append(f\"🔗 Raw URL: {file_data.get('raw_url', '')}\")\n context_parts.append(f\"📊 File Size: {file_data['size']:,} bytes\")\n context_parts.append(\n f\"📏 Character Count: {analysis.get('char_count', 0):,}\"\n )\n context_parts.append(\n f\"📄 Line Count: {analysis.get('line_count', 0):,}\"\n )\n context_parts.append(\n f\"📋 Non-Empty Lines: {analysis.get('non_empty_lines', 0):,}\"\n )\n context_parts.append(\n f\"⬜ Empty Lines: {analysis.get('empty_lines', 0):,}\"\n )\n context_parts.append(\n f\"📐 Max Line Length: {analysis.get('max_line_length', 0):,}\"\n )\n context_parts.append(\n f\"📊 Avg Line Length: {analysis.get('avg_line_length', 0)}\"\n )\n context_parts.append(f\"🎯 Chunks: {file_data.get('chunks', 0)}\")\n context_parts.append(\n f\"🏷️ Language: {analysis.get('language', 'Unknown')}\"\n )\n context_parts.append(\n f\"📎 Extension: {analysis.get('file_extension', 'none')}\"\n )\n context_parts.append(\n f\"🔤 Encoding: {analysis.get('estimated_encoding', 'utf-8')}\"\n )\n context_parts.append(f\"⭐ SHA: {file_data.get('sha', '')}\")\n context_parts.append(\n f\"🕒 Last Updated: {file_data.get('last_updated', '')}\"\n )\n\n # Language-specific metadata\n if \"import_lines\" in analysis:\n context_parts.append(\n f\"📦 Import Lines: {analysis['import_lines']:,}\"\n )\n if \"comment_lines\" in analysis:\n context_parts.append(\n f\"💬 Comment Lines: {analysis['comment_lines']:,}\"\n )\n if \"function_lines\" in analysis:\n context_parts.append(\n f\"⚡ Function Lines: {analysis['function_lines']:,}\"\n )\n if \"class_lines\" in analysis:\n context_parts.append(f\"🏗️ Class Lines: {analysis['class_lines']:,}\")\n\n context_parts.append(\n f\"🎨 Whitespace Ratio: {analysis.get('whitespace_ratio', 0)}%\"\n )\n context_parts.append(\n f\"📍 Indented Lines: {analysis.get('indented_lines', 0):,}\"\n )\n context_parts.append(f\"🔤 Tab Lines: {analysis.get('tab_lines', 0):,}\")\n context_parts.append(\n f\"🔸 Space Lines: {analysis.get('space_lines', 0):,}\"\n )\n\n context_parts.append(f\"{'─' * 80}\")\n context_parts.append(\"CONTENT START:\")\n context_parts.append(f\"{'─' * 80}\")\n\n # Character-perfect content reproduction\n context_parts.append(file_data[\"content\"])\n\n context_parts.append(f\"{'─' * 80}\")\n context_parts.append(f\"CONTENT END: {file_path}\")\n context_parts.append(f\"{'─' * 80}\")\n\n full_context = \"\\n\".join(context_parts)\n\n # Only truncate if absolutely necessary for full mode\n if len(full_context) > self.valves.max_context_length:\n truncate_point = self.valves.max_context_length - 500\n full_context = (\n full_context[:truncate_point]\n + f\"\\n\\n{'═' * 80}\\n[CONTEXT TRUNCATED AT {len(full_context):,} CHARACTERS]\\n[INCREASE max_context_length FOR COMPLETE REPRODUCTION]\\n[ORIGINAL FULL SIZE: {len(full_context):,} CHARACTERS]\\n{'═' * 80}\"\n )\n\n return full_context\n\n def _should_trigger_loading(self, messages: List[Dict], user_valves) -> bool:\n \"\"\"Determine if repository should be loaded based on user input\"\"\"\n if not messages or not self.valves.github_repo:\n return False\n\n # Get last user message\n user_messages = [msg for msg in messages if msg[\"role\"] == \"user\"]\n if not user_messages:\n return False\n\n last_message = user_messages[-1][\"content\"].lower()\n\n # Check for manual purge commands first\n purge_commands = [\n \"purge cache\",\n \"purge context\",\n \"clear cache\",\n \"clear context\",\n \"reload repo\",\n \"refresh repo\",\n ]\n if any(cmd in last_message for cmd in purge_commands):\n return True\n\n # Check user's preferred mode\n preferred_mode = getattr(user_valves, \"preferred_context_mode\", \"auto\")\n\n if preferred_mode == \"never\":\n return False\n elif preferred_mode == \"always\":\n return True\n elif preferred_mode == \"on-request\":\n # Only load if explicitly requested\n trigger_words = [\n \"load repo\",\n \"repository\",\n \"show files\",\n \"analyze code\",\n \"repo analysis\",\n ]\n return any(word in last_message for word in trigger_words)\n else: # auto mode\n # Check auto-trigger phrases\n trigger_phrases = getattr(user_valves, \"auto_trigger_phrases\", \"\").split(\n \",\"\n )\n trigger_phrases = [\n phrase.strip().lower() for phrase in trigger_phrases if phrase.strip()\n ]\n\n return any(phrase in last_message for phrase in trigger_phrases)\n\n def _determine_context_mode(self, messages: List[Dict]) -> str:\n \"\"\"Determine which context mode to use based on message content\"\"\"\n if not messages:\n return self.valves.context_mode\n\n # Get last user message\n user_messages = [msg for msg in messages if msg[\"role\"] == \"user\"]\n if not user_messages:\n return self.valves.context_mode\n\n last_message = user_messages[-1][\"content\"].lower()\n\n # Check for explicit mode requests\n if any(\n phrase in last_message\n for phrase in [\n \"full context\",\n \"complete repository\",\n \"all files\",\n \"entire codebase\",\n ]\n ):\n return \"full\"\n\n # Check for question patterns that benefit from search\n question_indicators = [\n \"how\",\n \"what\",\n \"where\",\n \"why\",\n \"when\",\n \"which\",\n \"find\",\n \"search\",\n \"locate\",\n \"?\",\n ]\n is_question = any(\n indicator in last_message for indicator in question_indicators\n )\n\n # Override context mode based on message content\n if self.valves.context_mode == \"smart\":\n if is_question or \"analyze\" in last_message or \"explain\" in last_message:\n return \"smart\" # Use semantic search\n else:\n return \"query-only\"\n\n return self.valves.context_mode\n\n def purge_cache(self, __event_emitter__=None):\n \"\"\"Purge all cached data with detailed feedback\"\"\"\n files_count = len(self.repo_cache)\n embeddings_count = len(self.embeddings_cache)\n\n self.repo_cache = {}\n self.embeddings_cache = {}\n self.file_tree_cache = \"\"\n self.detailed_tree_cache = \"\"\n self.repo_metadata = {}\n self.cache_timestamp = 0\n\n # Remove persistent cache files\n if self.valves.persistent_cache:\n try:\n cache_key = self._get_cache_key()\n cache_file = os.path.join(self.cache_dir, f\"{cache_key}.pkl\")\n if os.path.exists(cache_file):\n os.remove(cache_file)\n if self.valves.debug_mode:\n print(f\"✅ Removed persistent cache file: {cache_file}\")\n except Exception as e:\n if self.valves.debug_mode:\n print(f\"❌ Error removing cache file: {e}\")\n\n print(\n f\"🗑️ Repository cache purged: {files_count:,} files, {embeddings_count:,} embeddings\"\n )\n return f\"🗑️ Cache purged: {files_count:,} files and {embeddings_count:,} embeddings cleared\"\n\n def _is_cache_valid(self) -> bool:\n \"\"\"Check if cache is still valid\"\"\"\n if not self.repo_cache:\n return False\n return (time.time() - self.cache_timestamp) < self.valves.cache_duration\n\n async def inlet(\n self, body: dict, __user__: Optional[dict] = None, __event_emitter__=None\n ) -> dict:\n \"\"\"Process incoming request and inject GitHub repository context with precision controls\"\"\"\n\n # Get user valves\n user_valves = None\n if __user__ and \"valves\" in __user__:\n user_valves = __user__[\"valves\"]\n\n # Check if user has disabled GitHub context\n if user_valves and hasattr(user_valves, \"enable_github_context\"):\n if not user_valves.enable_github_context:\n return body\n\n # Check repository configuration\n if not self.valves.github_repo:\n if self.valves.debug_mode:\n print(\"❌ No GitHub repository configured\")\n return body\n\n messages = body.get(\"messages\", [])\n\n # Check for manual purge commands\n if messages and self.valves.enable_manual_purge:\n user_messages = [msg for msg in messages if msg[\"role\"] == \"user\"]\n if user_messages:\n last_message = user_messages[-1][\"content\"].lower()\n purge_commands = [\n \"purge cache\",\n \"purge context\",\n \"clear cache\",\n \"clear context\",\n ]\n\n if any(cmd in last_message for cmd in purge_commands):\n if __event_emitter__:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": \"🗑️ Purging repository cache...\",\n \"done\": False,\n },\n }\n )\n\n purge_result = self.purge_cache(__event_emitter__)\n\n if __event_emitter__:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"✅ {purge_result}\",\n \"done\": True,\n },\n }\n )\n\n # Force reload on next request\n return body\n\n # Determine if we should load the repository\n should_load = (\n self.valves.auto_load_on_startup\n or self._should_trigger_loading(messages, user_valves)\n or not self._is_cache_valid()\n )\n\n # Load repository if needed\n if should_load:\n if __event_emitter__:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": \"🔄 Repository loading initiated...\",\n \"done\": False,\n },\n }\n )\n\n success = await self.load_repository(__event_emitter__)\n if not success:\n if self.valves.debug_mode:\n print(\"❌ Failed to load repository\")\n return body\n\n # Skip if no cache available\n if not self.repo_cache:\n if self.valves.debug_mode:\n print(\"❌ No cached repository data available\")\n return body\n\n # Determine context mode\n context_mode = self._determine_context_mode(messages)\n\n # Build appropriate context based on mode\n if context_mode == \"full\":\n context = self._build_full_context(user_valves)\n elif context_mode in [\"smart\", \"query-only\"]:\n # Use last user message for search\n user_messages = [msg for msg in messages if msg[\"role\"] == \"user\"]\n query = user_messages[-1][\"content\"] if user_messages else \"\"\n context = self._build_context_from_search(query, user_valves)\n else:\n return body # No context injection\n\n if not context:\n return body\n\n # Add custom user prompt if provided\n custom_prompt = \"\"\n if user_valves and hasattr(user_valves, \"custom_system_prompt\"):\n custom_prompt = user_valves.custom_system_prompt\n\n if custom_prompt:\n context = f\"{custom_prompt}\\n\\n{context}\"\n\n # Remove any existing repository system messages to avoid duplicates\n messages = [\n msg\n for msg in messages\n if not (\n msg[\"role\"] == \"system\"\n and (\n \"🔍 REPOSITORY CONTEXT\" in msg.get(\"content\", \"\")\n or \"🗂️ COMPLETE REPOSITORY CONTEXT\" in msg.get(\"content\", \"\")\n )\n )\n ]\n\n # Create comprehensive system message\n system_message = {\"role\": \"system\", \"content\": context}\n\n # Insert at beginning\n messages.insert(0, system_message)\n body[\"messages\"] = messages\n\n # Debug logging\n if self.valves.debug_mode:\n print(\n f\"✅ Context injected: {len(context):,} characters in {context_mode} mode\"\n )\n print(\n f\"📊 Repository: {self.valves.github_repo} ({len(self.repo_cache):,} files)\"\n )\n print(\n f\"🧠 Embeddings: {'enabled' if self.valves.enable_semantic_search else 'disabled'} ({len(self.embeddings_cache):,} cached)\"\n )\n\n # Final status confirmation\n if __event_emitter__:\n files_count = len(self.repo_cache)\n total_size = sum(f[\"size\"] for f in self.repo_cache.values())\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": f\"✅ Repository context active: {files_count:,} files ({total_size:,} bytes) loaded in {context_mode} mode\",\n \"done\": True,\n \"hidden\": False,\n },\n }\n )\n\n return body\n\n async def outlet(\n self, body: dict, __user__: Optional[dict] = None, __event_emitter__=None\n ) -> dict:\n \"\"\"Process outgoing response with optional enhancements\"\"\"\n # Could add response processing, logging, or citations here\n return body\n"}]