| import os |
| from typing import List, Dict, Generator |
| import mimetypes |
| from pathlib import Path |
|
|
class FileProcessor:
    """Walk a repository tree and yield the text files that pass the filters.

    A file is accepted when it is not inside an ignored directory, its name
    is not in ``ignore_files``, its lowercased suffix is listed in
    ``supported_extensions`` and its on-disk size does not exceed
    ``max_file_size``.
    """

    def __init__(self, supported_extensions: List[str], max_file_size: int):
        """Store the filter configuration.

        Args:
            supported_extensions: Suffixes to accept. They are compared
                against ``Path.suffix.lower()``, so entries must include the
                leading dot and be lowercase (e.g. ``".py"``).
            max_file_size: Largest size in bytes (as reported by
                ``os.path.getsize``) a file may have and still be accepted.
        """
        self.supported_extensions = supported_extensions
        self.max_file_size = max_file_size
        # Directory names that are never descended into / accepted.
        self.ignore_dirs = {
            '.git', '__pycache__', 'node_modules', '.pytest_cache',
            'venv', 'env', '.venv', 'build', 'dist', '.next',
            'coverage', '.coverage', 'logs', 'log'
        }
        # Exact file names that are always skipped.
        self.ignore_files = {
            '.gitignore', '.env', '.env.local', '.DS_Store',
            'package-lock.json', 'yarn.lock', 'poetry.lock'
        }

    def _passes_file_filters(self, file_path: str) -> bool:
        """Apply the name, extension and size filters to a single file.

        Directory-based filtering is intentionally not done here; callers
        decide which ancestors are relevant.
        """
        path = Path(file_path)

        if path.name in self.ignore_files:
            return False

        if path.suffix.lower() not in self.supported_extensions:
            return False

        try:
            return os.path.getsize(file_path) <= self.max_file_size
        except OSError:
            # Missing or unreadable file: treat as not processable.
            return False

    def should_process_file(self, file_path: str) -> bool:
        """Check if file should be processed.

        Every ancestor directory that appears in ``file_path`` is compared
        against ``ignore_dirs``. This includes directories above the
        repository root when an absolute path is given.

        Args:
            file_path: Path of the file to test.

        Returns:
            True when no ancestor is ignored and the file passes the name,
            extension and size filters; False otherwise (including when the
            size cannot be read).
        """
        if any(parent.name in self.ignore_dirs for parent in Path(file_path).parents):
            return False

        return self._passes_file_filters(file_path)

    def extract_files(self, repo_path: str) -> Generator[Dict, None, None]:
        """Extract and yield file information.

        Args:
            repo_path: Root directory to walk.

        Yields:
            One dict per accepted file with the keys ``path`` (relative to
            ``repo_path``), ``content`` (text decoded as UTF-8 with
            undecodable bytes dropped), ``extension`` (lowercased suffix)
            and ``size`` (number of characters in ``content``).

        Files that cannot be opened or read are reported with ``print`` and
        skipped.
        """
        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk never descends into ignored
            # directories inside the repository.
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]

            for file_name in files:
                file_path = os.path.join(root, file_name)

                # Only the per-file filters are applied here: directories
                # inside the repo were pruned above, and the location of the
                # repository itself (e.g. under a "build" or "env" folder)
                # must not cause every file to be rejected.
                if not self._passes_file_filters(file_path):
                    continue

                relative_path = os.path.relpath(file_path, repo_path)

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except OSError as e:
                    print(f"Error reading file {relative_path}: {e}")
                    continue

                # Yield outside the try block so exceptions raised into the
                # generator by the consumer are not mistaken for read errors.
                yield {
                    'path': relative_path,
                    'content': content,
                    'extension': Path(file_path).suffix.lower(),
                    'size': len(content)
                }