git-chat / utils /file_processor.py
lakkiroy's picture
Upload folder using huggingface_hub
200bf6d verified
import mimetypes
import os
from pathlib import Path
from typing import Dict, Generator, List, Optional
class FileProcessor:
    """Select and read the text files found under a repository directory.

    A file is selected when it is not inside an ignored directory, is not
    one of the ignored file names, has a supported extension and is no
    larger than the configured size limit.
    """

    def __init__(self, supported_extensions: List[str], max_file_size: int):
        """Store the filtering configuration.

        Args:
            supported_extensions: Extensions to accept. They are compared
                against ``Path.suffix.lower()``, so entries must be lowercase
                and include the leading dot (e.g. ``".py"``).
            max_file_size: Largest accepted file size, compared against
                ``os.path.getsize`` (bytes).
        """
        self.supported_extensions = supported_extensions
        self.max_file_size = max_file_size
        # Directory names that are never descended into / accepted.
        self.ignore_dirs = {
            '.git', '__pycache__', 'node_modules', '.pytest_cache',
            'venv', 'env', '.venv', 'build', 'dist', '.next',
            'coverage', '.coverage', 'logs', 'log'
        }
        # Exact file names that are always skipped.
        self.ignore_files = {
            '.gitignore', '.env', '.env.local', '.DS_Store',
            'package-lock.json', 'yarn.lock', 'poetry.lock'
        }

    def should_process_file(
        self, file_path: str, repo_path: Optional[str] = None
    ) -> bool:
        """Check if file should be processed.

        Args:
            file_path: Path of the candidate file on disk.
            repo_path: Optional repository root. When given, only the
                directories between ``repo_path`` and the file are tested
                against ``ignore_dirs``; directories above the root (for
                example a checkout living in ``/tmp/build/repo``) no longer
                cause every file to be rejected. When omitted, every
                ancestor of ``file_path`` is tested, as before.

        Returns:
            True if the file passes the directory, name, extension and size
            checks; False otherwise, including when its size cannot be read.
        """
        path = Path(file_path)

        # Restrict the ignored-directory check to the part of the path that
        # lies inside the repository, when the root is known.
        scoped = path
        if repo_path is not None:
            try:
                scoped = Path(os.path.relpath(file_path, repo_path))
            except ValueError:
                # relpath cannot relate the two paths (e.g. different drives
                # on Windows); fall back to checking the full path.
                scoped = path

        # Check if any parent directory is in ignore list
        if any(parent.name in self.ignore_dirs for parent in scoped.parents):
            return False

        # Check file name
        if path.name in self.ignore_files:
            return False

        # Check extension
        if path.suffix.lower() not in self.supported_extensions:
            return False

        # Check file size; an unreadable size means the file is skipped.
        try:
            if os.path.getsize(file_path) > self.max_file_size:
                return False
        except OSError:
            return False

        return True

    def extract_files(self, repo_path: str) -> Generator[Dict, None, None]:
        """Extract and yield file information.

        Walks ``repo_path`` and yields one dict per accepted file with the
        keys ``path`` (relative to ``repo_path``), ``content`` (text decoded
        as UTF-8, undecodable bytes dropped), ``extension`` (lowercased
        suffix) and ``size`` (length of the decoded text in characters).
        Files that cannot be read are reported with ``print`` and skipped.
        """
        for root, dirs, files in os.walk(repo_path):
            # Prune ignored directories in place so os.walk never enters them.
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]

            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, repo_path)

                # Pass the root so directories above the repository are not
                # mistaken for ignored directories.
                if not self.should_process_file(file_path, repo_path):
                    continue

                # Only the read is guarded: keeping the yield outside the try
                # prevents exceptions thrown into the generator from being
                # swallowed and misreported as read errors.
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except Exception as e:  # best effort: report and move on
                    print(f"Error reading file {relative_path}: {e}")
                    continue

                yield {
                    'path': relative_path,
                    'content': content,
                    'extension': Path(file_path).suffix.lower(),
                    'size': len(content)
                }