import os
from typing import List, Dict, Generator, Optional
import mimetypes
from pathlib import Path


class FileProcessor:
    """Walk a repository tree and yield the text files worth processing.

    A file is accepted when it is not inside an ignored directory, is not an
    ignored file name, has a supported extension, and is no larger than
    ``max_file_size`` bytes.
    """

    def __init__(self, supported_extensions: List[str], max_file_size: int):
        """Store the filter configuration.

        Args:
            supported_extensions: Suffixes to accept, compared against the
                lower-cased ``Path.suffix`` of each file (so entries are
                expected in the form ``".py"``).
            max_file_size: Largest on-disk size, in bytes, that is accepted.
        """
        self.supported_extensions = supported_extensions
        self.max_file_size = max_file_size
        # Directory names that are never descended into.
        self.ignore_dirs = {
            '.git', '__pycache__', 'node_modules', '.pytest_cache',
            'venv', 'env', '.venv', 'build', 'dist', '.next',
            'coverage', '.coverage', 'logs', 'log',
        }
        # Exact file names that are always skipped.
        self.ignore_files = {
            '.gitignore', '.env', '.env.local', '.DS_Store',
            'package-lock.json', 'yarn.lock', 'poetry.lock',
        }

    def should_process_file(self, file_path: str, repo_root: Optional[str] = None) -> bool:
        """Check if file should be processed.

        Args:
            file_path: Path of the candidate file.
            repo_root: Optional root of the repository being scanned. When
                given and ``file_path`` lies under it, only directory
                components *inside* the root are matched against
                ``ignore_dirs``; this keeps a checkout located at e.g.
                ``/tmp/build/repo`` from being rejected wholesale. When
                omitted (or when the file is not under the root), every
                ancestor of ``file_path`` is checked.

        Returns:
            True if the file passes all filters, False otherwise (including
            when its size cannot be determined).
        """
        path = Path(file_path)

        # Restrict the ignored-directory check to the part of the path that
        # lives inside the repository, when the root is known.
        scoped = path
        if repo_root is not None:
            try:
                scoped = path.relative_to(repo_root)
            except ValueError:
                # Not located under repo_root: fall back to the full path.
                scoped = path

        # Check if any parent directory is in ignore list
        if any(parent.name in self.ignore_dirs for parent in scoped.parents):
            return False

        # Check file name
        if path.name in self.ignore_files:
            return False

        # Check extension
        if path.suffix.lower() not in self.supported_extensions:
            return False

        # Check file size; an unreadable/vanished file is simply skipped.
        try:
            if os.path.getsize(file_path) > self.max_file_size:
                return False
        except OSError:
            return False

        return True

    def extract_files(self, repo_path: str) -> Generator[Dict, None, None]:
        """Extract and yield file information.

        Args:
            repo_path: Directory to walk.

        Yields:
            One dict per accepted file with the keys ``path`` (relative to
            ``repo_path``), ``content`` (text decoded as UTF-8, undecodable
            bytes dropped), ``extension`` (lower-cased suffix) and ``size``
            (length of ``content`` in characters).
        """
        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk does not descend into ignored dirs.
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]

            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, repo_path)

                if not self.should_process_file(file_path, repo_path):
                    continue

                # Only the read is guarded; the yield stays outside the try
                # so exceptions raised by the consumer are not swallowed here.
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except Exception as e:
                    print(f"Error reading file {relative_path}: {e}")
                    continue

                yield {
                    'path': relative_path,
                    'content': content,
                    'extension': Path(file_path).suffix.lower(),
                    'size': len(content),
                }