| """ | |
| Utility functions for the Iain Morris article generator project | |
| """ | |
| import json | |
| import os | |
| import logging | |
| from typing import Dict, List, Optional | |
| import requests | |
| from datetime import datetime | |
| logger = logging.getLogger(__name__) | |


def setup_logging(log_level: str = "INFO"):
    """
    Set up logging configuration

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
    """
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('morris_bot.log'),
            logging.StreamHandler()
        ]
    )
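# Note: logging.basicConfig() is a no-op if the root logger already has
# handlers; on Python 3.8+ you can pass force=True to reconfigure. Typical
# usage (assumed): call setup_logging("DEBUG") once at program start.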


def ensure_directories():
    """Ensure all required directories exist"""
    directories = [
        'data',
        'models',
        'models/lora_adapters',
        'logs'
    ]
    for directory in directories:
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Ensured directory exists: {directory}")


def load_json(filepath: str) -> Optional[Dict]:
    """
    Load JSON file safely

    Args:
        filepath: Path to JSON file

    Returns:
        Loaded data or None if failed
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        logger.error(f"Error loading JSON from {filepath}: {e}")
        return None


def save_json(data: Dict, filepath: str):
    """
    Save data to JSON file

    Args:
        data: Data to save
        filepath: Output file path
    """
    try:
        # Only create parent directories when the path actually has one;
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved data to {filepath}")
    except Exception as e:
        logger.error(f"Error saving JSON to {filepath}: {e}")


def validate_articles(articles: List[Dict]) -> List[Dict]:
    """
    Validate article data structure

    Args:
        articles: List of article dictionaries

    Returns:
        List of valid articles
    """
    valid_articles = []
    required_fields = ['title', 'content', 'author', 'url']
    for i, article in enumerate(articles):
        if all(field in article and article[field] for field in required_fields):
            valid_articles.append(article)
        else:
            logger.warning(f"Article {i} missing required fields: {article.get('title', 'Unknown')}")
    logger.info(f"Validated {len(valid_articles)} out of {len(articles)} articles")
    return valid_articles
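# A record passes validation only if every required field is present and
# non-empty, e.g. (illustrative values):
#   {"title": "...", "content": "...", "author": "Iain Morris", "url": "https://..."}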


def get_model_info():
    """Get information about available models"""
    model_info = {
        "base_models": {
            "mistralai/Mistral-7B-Instruct-v0.1": {
                "description": "High-quality 7B parameter model, excellent for fine-tuning",
                "memory_requirement": "~14GB GPU memory with 4-bit quantization",
                "recommended": True
            },
            "meta-llama/Llama-2-7b-chat-hf": {
                "description": "Popular 7B chat model, good performance",
                "memory_requirement": "~14GB GPU memory with 4-bit quantization",
                "recommended": True
            },
            "microsoft/DialoGPT-medium": {
                "description": "Smaller model, faster training but lower quality",
                "memory_requirement": "~4GB GPU memory",
                "recommended": False
            }
        },
        "training_requirements": {
            "minimum_gpu_memory": "8GB",
            "recommended_gpu_memory": "16GB+",
            "training_time_estimate": "4-6 hours on RTX 3080",
            "cpu_training": "Possible but very slow (24+ hours)"
        }
    }
    return model_info


def check_system_requirements():
    """Check if the system meets requirements for training"""
    requirements = {
        "python_version": True,   # assumed compatible; not actually verified here
        "torch_available": False,
        "cuda_available": False,
        "gpu_memory": 0,
        "disk_space": True        # assumed sufficient; not actually verified here
    }
    try:
        import torch
        requirements["torch_available"] = True
        if torch.cuda.is_available():
            requirements["cuda_available"] = True
            requirements["gpu_memory"] = torch.cuda.get_device_properties(0).total_memory / 1e9
    except ImportError:
        pass
    return requirements
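# Example (sketch): inspect the detected hardware before training.
#   reqs = check_system_requirements()
#   # e.g. {'python_version': True, 'torch_available': True, 'cuda_available': True,
#   #       'gpu_memory': 10.7, 'disk_space': True}   (values depend on the machine)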


def estimate_training_time(num_articles: int, gpu_memory: float) -> str:
    """
    Estimate training time based on dataset size and hardware

    Args:
        num_articles: Number of training articles
        gpu_memory: GPU memory in GB

    Returns:
        Estimated training time string
    """
    if gpu_memory >= 16:
        base_time = 0.5  # minutes per article
    elif gpu_memory >= 8:
        base_time = 1.0
    else:
        base_time = 5.0  # CPU training
    total_minutes = num_articles * base_time * 3  # 3 epochs
    if total_minutes < 60:
        return f"~{int(total_minutes)} minutes"
    else:
        hours = total_minutes / 60
        return f"~{hours:.1f} hours"


def create_project_summary() -> Dict:
    """Create a summary of the project status"""
    summary = {
        "timestamp": datetime.now().isoformat(),
        "files_created": [],
        "data_status": {},
        "model_status": {},
        "next_steps": []
    }

    # Check which files exist
    files_to_check = [
        "requirements.txt",
        "app.py",
        "src/scraper.py",
        "src/preprocess.py",
        "src/finetune.py",
        "src/utils.py"
    ]
    for file_path in files_to_check:
        if os.path.exists(file_path):
            summary["files_created"].append(file_path)

    # Check data status
    if os.path.exists("data/raw_articles.json"):
        articles = load_json("data/raw_articles.json")
        if articles:
            summary["data_status"]["raw_articles"] = len(articles)
    if os.path.exists("data/train_dataset.json"):
        train_data = load_json("data/train_dataset.json")
        if train_data:
            summary["data_status"]["training_examples"] = len(train_data)

    # Check model status; ensure_directories() creates the folder up front,
    # so only a non-empty adapter directory counts as a trained model
    if os.path.isdir("models/lora_adapters") and os.listdir("models/lora_adapters"):
        summary["model_status"]["lora_adapters"] = "Available"
    else:
        summary["model_status"]["lora_adapters"] = "Not trained"

    # Determine next steps
    if not summary["data_status"]:
        summary["next_steps"].append("1. Run scraper to collect articles")
        summary["next_steps"].append("2. Run preprocessing to prepare training data")
        summary["next_steps"].append("3. Run fine-tuning to train the model")
        summary["next_steps"].append("4. Launch the Gradio app")
    elif "training_examples" not in summary["data_status"]:
        summary["next_steps"].append("1. Run preprocessing to prepare training data")
        summary["next_steps"].append("2. Run fine-tuning to train the model")
        summary["next_steps"].append("3. Launch the Gradio app")
    elif summary["model_status"]["lora_adapters"] == "Not trained":
        summary["next_steps"].append("1. Run fine-tuning to train the model")
        summary["next_steps"].append("2. Launch the Gradio app")
    else:
        summary["next_steps"].append("1. Launch the Gradio app")
        summary["next_steps"].append("2. Test article generation")

    return summary


def print_project_status():
    """Print current project status"""
    summary = create_project_summary()

    print("\n" + "=" * 60)
    print("🤖 IAIN MORRIS ARTICLE GENERATOR - PROJECT STATUS")
    print("=" * 60)
    print(f"\n📅 Last Updated: {summary['timestamp']}")

    print(f"\n📁 Files Created ({len(summary['files_created'])}):")
    for file_path in summary['files_created']:
        print(f"   ✅ {file_path}")

    print("\n📊 Data Status:")
    if summary['data_status']:
        for key, value in summary['data_status'].items():
            print(f"   📄 {key}: {value}")
    else:
        print("   ❌ No data collected yet")

    print("\n🤖 Model Status:")
    for key, value in summary['model_status'].items():
        status_icon = "✅" if value == "Available" else "❌"
        print(f"   {status_icon} {key}: {value}")

    print("\n🎯 Next Steps:")
    for step in summary['next_steps']:
        print(f"   {step}")
    print("\n" + "=" * 60)
| if __name__ == "__main__": | |
| setup_logging() | |
| ensure_directories() | |
| print_project_status() | |
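# Usage note (assumed invocation from the project root so the relative paths
# above resolve): running `python src/utils.py` prints the status banner.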