#!/usr/bin/env python3
"""
Upload script for dd-framework: Due Diligence Methodology and Templates

This uploads the core framework components: checklists, questions, and strategy docs.
"""
import json
import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# NOTE: huggingface_hub is imported lazily inside upload_framework() so the
# pure analysis / README helpers below can be used (and tested) without the
# package installed.

# (stats-dict key, sub-directory under data/) for each framework component.
_COMPONENT_DIRS: List[Tuple[str, str]] = [
    ("checklists", "checklist"),
    ("questions", "questions"),
    ("strategy", "strategy"),
]


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans data/checklist, data/questions and data/strategy for *.md files
    (relative to the current working directory). Missing directories are
    skipped silently.

    Returns:
        Dict with one list of per-file stats per component key
        ("checklists", "questions", "strategy") plus running
        "total_files" and "total_lines" counters. Each per-file entry
        has "name", "path", "lines" and "size_kb".
    """
    base_path = Path("data")
    components: Dict = {
        "checklists": [],
        "questions": [],
        "strategy": [],
        "total_files": 0,
        "total_lines": 0,
    }

    # One data-driven loop replaces three previously copy-pasted blocks.
    for key, subdir in _COMPONENT_DIRS:
        dir_path = base_path / subdir
        if not dir_path.exists():
            continue
        for file_path in dir_path.glob("*.md"):
            # Explicit utf-8: framework files may contain non-ASCII text and
            # the platform default encoding is not guaranteed to be utf-8.
            lines = len(file_path.read_text(encoding="utf-8").splitlines())
            components[key].append({
                "name": file_path.name,
                "path": str(file_path.relative_to(Path("."))),
                "lines": lines,
                "size_kb": round(file_path.stat().st_size / 1024, 1),
            })
            components["total_lines"] += lines
            components["total_files"] += 1

    return components


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create a comprehensive README (dataset card) for the dd-framework repository.

    Args:
        repo_id: Full Hub repo id (e.g. "user/dd-framework"), interpolated
            into download examples and the citation URL.
        components: Statistics dict as produced by analyze_framework_components().

    Returns:
        The full README.md content including YAML front matter.
    """
    # Overall payload size across all three component groups.
    total_size_kb = sum([
        sum(item["size_kb"] for item in components["checklists"]),
        sum(item["size_kb"] for item in components["questions"]),
        sum(item["size_kb"] for item in components["strategy"]),
    ])

    checklist_details = "\n".join([
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in components["checklists"]
    ])
    questions_details = "\n".join([
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in components["questions"]
    ])
    strategy_details = "\n".join([
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in components["strategy"]
    ])

    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# 📋 Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## 🎯 What's Included

### 📑 **Due Diligence Checklists** ({len(components["checklists"])} files)

Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### ❓ **Question Templates** ({len(components["questions"])} files)

Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### 🎯 **Strategic Analysis Framework** ({len(components["strategy"])} files)

Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## 📊 **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## 🚀 **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd dd-framework
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## 🔗 **Related Datasets**

This framework is part of a complete due diligence toolkit:

- 📋 **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- ⚡ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- 📁 **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## 🎨 **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## 📈 **Framework Structure**

```
data/
├── checklist/
│   ├── bloomberg.md          # Bloomberg-style comprehensive checklist
│   └── original.md           # Traditional M&A checklist format
├── questions/
│   ├── due diligence.md      # Core question templates
│   └── expanded.md           # Extended question variations
└── strategy/
    ├── rockman.md            # Strategic analysis methodology
    └── rockman - alternative.md  # Alternative approach
```

## 🏷️ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## ⚖️ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## 📖 **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## 📧 **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""


def prepare_framework_upload() -> Path:
    """Prepare a staging directory containing the framework components only.

    Recreates hf_framework_upload/ from scratch (so stale files never get
    uploaded) and copies the three component directories into it. Missing
    source directories are reported and skipped.

    Returns:
        Path to the prepared staging directory.
    """
    upload_dir = Path("hf_framework_upload")

    # Clean and create upload directory
    if upload_dir.exists():
        shutil.rmtree(upload_dir)
    upload_dir.mkdir()

    # Create data directory structure
    data_dst = upload_dir / "data"
    data_dst.mkdir()

    for src_dir, dst_dir in [(f"data/{sub}", sub) for _, sub in _COMPONENT_DIRS]:
        src_path = Path(src_dir)
        dst_path = data_dst / dst_dir
        if src_path.exists():
            shutil.copytree(src_path, dst_path)
            print(f"✅ Copied {src_dir} -> {dst_path}")
        else:
            print(f"⚠️ Skipped {src_dir} (not found)")

    return upload_dir


def upload_framework(repo_id: str, token: Optional[str] = None) -> bool:
    """Upload dd-framework to the Hugging Face Hub.

    Args:
        repo_id: Target dataset repo id on the Hub.
        token: HF access token with write permission; None falls back to the
            huggingface_hub default credential resolution.

    Returns:
        True on success, False if repo creation or the upload failed.
    """
    # Third-party import kept function-local so the rest of the module works
    # without huggingface_hub installed. (The previous HfApi instance was
    # unused — create_repo/upload_folder accept the token directly.)
    from huggingface_hub import create_repo, upload_folder

    print("🚀 Starting dd-framework upload...")

    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,  # idempotent: re-running the script is safe
            private=False,
        )
        print(f"✅ Created/verified repository: {repo_id}")
    except Exception as e:
        print(f"❌ Error creating repository: {e}")
        return False

    # Gather stats first — they feed both the README and the metadata file.
    print("📊 Analyzing framework components...")
    components = analyze_framework_components()
    print(f"Found {components['total_files']} files with {components['total_lines']:,} total lines")

    print("📁 Preparing framework files...")
    upload_dir = prepare_framework_upload()

    print("📝 Creating dataset card...")
    readme_content = create_framework_readme(repo_id, components)
    # Explicit utf-8: the card contains emoji, which locale encodings such as
    # cp1252 (Windows default) cannot encode — write_text would raise.
    (upload_dir / "README.md").write_text(readme_content, encoding="utf-8")

    metadata = {
        "repository": "dd-framework",
        "description": "Due diligence methodology and templates",
        "components": components,
        "upload_date": datetime.now().isoformat(),
        "version": "1.0.0",
        "related_repositories": [
            "dd-indexes",
            "dd-vdrs",
        ],
    }
    (upload_dir / "framework_metadata.json").write_text(
        json.dumps(metadata, indent=2), encoding="utf-8"
    )

    try:
        print(f"🚀 Uploading to {repo_id}...")
        upload_folder(
            folder_path=upload_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology",
        )
        print(f"✅ Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
        print(f"📊 Uploaded {components['total_files']} files, {components['total_lines']:,} lines")
        return True
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False
    finally:
        # Always remove the staging directory, success or failure.
        if upload_dir.exists():
            shutil.rmtree(upload_dir)
            print("🧹 Cleaned up temporary files")


def main() -> None:
    """Entry point: read configuration, validate it, and run the upload."""
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    print("🔧 DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    print(f"Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
    print()

    if not HF_TOKEN:
        print("❌ Please set your HF_TOKEN environment variable")
        print("1. Go to https://huggingface.co/settings/tokens")
        print("2. Create a token with 'write' permissions")
        print("3. Run: export HF_TOKEN='your_token_here'")
        return

    # Template guard for anyone who copies this script and forgets to
    # personalize REPO_ID.
    if REPO_ID == "your-username/dd-framework":
        print("❌ Please update REPO_ID with your actual username!")
        print("Edit this script and change the REPO_ID variable")
        return

    success = upload_framework(REPO_ID, HF_TOKEN)

    if success:
        print("\n🎉 Upload completed successfully!")
        print(f"🔗 View your dataset: https://huggingface.co/datasets/{REPO_ID}")
        print("📋 Next steps:")
        print("  - Review the dataset card")
        print("  - Test downloading components")
        print("  - Share with the community!")
    else:
        print("\n💥 Upload failed - check error messages above")


if __name__ == "__main__":
    main()