Spaces:

jmzlx
/

dd-poc

Sleeping

File size: 13,631 Bytes

9a71b8f

#!/usr/bin/env python3
"""
Upload script for dd-framework: Due Diligence Methodology and Templates
This uploads the core framework components: checklists, questions, and strategy docs
"""

import os
import json
import shutil
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder
from typing import Dict, List, Tuple

def _collect_markdown_stats(directory: Path) -> List[Dict]:
    """Return one stats record per ``*.md`` file directly inside *directory*.

    Each record has the keys ``name``, ``path`` (relative to the current
    working directory), ``lines`` and ``size_kb`` (rounded to one decimal).
    A missing directory yields an empty list.
    """
    if not directory.exists():
        return []

    records = []
    # Sort so the listing order does not depend on filesystem enumeration
    # order; the README and metadata then come out the same on every run.
    for file_path in sorted(directory.glob("*.md")):
        # Decode explicitly as UTF-8 instead of the platform default. Only the
        # line count is needed, so undecodable bytes are replaced rather than
        # aborting the whole analysis.
        text = file_path.read_text(encoding="utf-8", errors="replace")
        records.append({
            "name": file_path.name,
            "path": str(file_path.relative_to(Path("."))),
            "lines": len(text.splitlines()),
            "size_kb": round(file_path.stat().st_size / 1024, 1),
        })
    return records


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans ``data/checklist``, ``data/questions`` and ``data/strategy``
    (relative to the current working directory) for markdown files.

    Returns:
        A dict with the list-valued keys ``checklists``, ``questions`` and
        ``strategy`` (one record per file, see ``_collect_markdown_stats``)
        plus the integer totals ``total_files`` and ``total_lines``.
    """
    base_path = Path("data")

    # (result key, sub-directory name) -- note the checklist key is plural
    # while its directory name is singular.
    sources = (
        ("checklists", "checklist"),
        ("questions", "questions"),
        ("strategy", "strategy"),
    )

    components = {
        key: _collect_markdown_stats(base_path / subdir)
        for key, subdir in sources
    }

    all_records = [record for key, _ in sources for record in components[key]]
    components["total_files"] = len(all_records)
    components["total_lines"] = sum(record["lines"] for record in all_records)

    return components

def _format_file_bullets(items: List[Dict]) -> str:
    """Render one markdown bullet per file record (name, line count, size)."""
    return "\n".join(
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in items
    )


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create comprehensive README (dataset card) for dd-framework repository.

    Args:
        repo_id: Hub repository id, e.g. ``"user/dd-framework"``. It is
            interpolated into the download examples, clone URL and citation.
        components: Statistics dict with the list-valued keys ``checklists``,
            ``questions`` and ``strategy`` (each item providing ``name``,
            ``lines`` and ``size_kb``) and the totals ``total_files`` and
            ``total_lines``.

    Returns:
        The full README text, starting with the YAML front matter block.
    """
    sections = ("checklists", "questions", "strategy")

    # Calculate total size (per-section subtotals, then the grand total)
    total_size_kb = sum(
        sum(item["size_kb"] for item in components[section])
        for section in sections
    )

    checklist_details = _format_file_bullets(components["checklists"])
    questions_details = _format_file_bullets(components["questions"])
    strategy_details = _format_file_bullets(components["strategy"])

    checklist_count = len(components["checklists"])
    questions_count = len(components["questions"])
    strategy_count = len(components["strategy"])

    # `git clone` checks out into a directory named after the last path
    # component of the URL, so derive the `cd` target from repo_id instead of
    # assuming the repository is literally called "dd-framework".
    repo_name = repo_id.rstrip("/").split("/")[-1]

    # NOTE: literal braces in the BibTeX entry below are doubled because this
    # is an f-string.
    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# 📋 Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## 🎯 What's Included

### 📑 **Due Diligence Checklists** ({checklist_count} files)
Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records  
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### ❓ **Question Templates** ({questions_count} files)
Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### 🎯 **Strategic Analysis Framework** ({strategy_count} files)
Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## 📊 **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## 🚀 **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates  
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}", 
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd {repo_name}
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## 🔗 **Related Datasets**

This framework is part of a complete due diligence toolkit:

- 📋 **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- ⚡ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- 📁 **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## 🎨 **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## 📈 **Framework Structure**

```
data/
├── checklist/
│   ├── bloomberg.md          # Bloomberg-style comprehensive checklist
│   └── original.md           # Traditional M&A checklist format
├── questions/  
│   ├── due diligence.md      # Core question templates
│   └── expanded.md           # Extended question variations
└── strategy/
    ├── rockman.md            # Strategic analysis methodology
    └── rockman - alternative.md  # Alternative approach
```

## 🏷️ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## ⚖️ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## 📖 **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## 📧 **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""

def prepare_framework_upload() -> Path:
    """Stage the framework component directories for upload.

    Rebuilds ``hf_framework_upload/`` from scratch in the current working
    directory and copies ``data/checklist``, ``data/questions`` and
    ``data/strategy`` into ``hf_framework_upload/data/``. A source directory
    that does not exist is reported and skipped.

    Returns:
        The relative path of the staging directory (``hf_framework_upload``).
    """
    staging_root = Path("hf_framework_upload")

    # Discard whatever an earlier run left behind so the tree starts empty.
    if staging_root.exists():
        shutil.rmtree(staging_root)

    staging_data = staging_root / "data"
    staging_data.mkdir(parents=True)

    for component in ("checklist", "questions", "strategy"):
        # Keep the label as a plain forward-slash string; it is what gets
        # printed in the progress messages.
        source_label = f"data/{component}"
        source = Path(source_label)
        target = staging_data / component

        if not source.exists():
            print(f"⚠️  Skipped {source_label} (not found)")
            continue

        shutil.copytree(source, target)
        print(f"✅ Copied {source_label} -> {target}")

    return staging_root

def upload_framework(repo_id: str, token: str = None) -> bool:
    """Upload dd-framework to Hugging Face Hub.

    Creates (or verifies) the public dataset repository, analyzes and stages
    the framework files, writes ``README.md`` and ``framework_metadata.json``
    into the staging directory, and uploads that directory.

    Args:
        repo_id: Target dataset repository id, e.g. ``"user/dd-framework"``.
        token: Hugging Face access token passed to ``create_repo`` and
            ``upload_folder``.

    Returns:
        ``True`` when the upload succeeded; ``False`` when creating the
        repository or uploading raised an exception (the error is printed).
        Exceptions raised while analyzing, staging or writing the generated
        files are not caught and propagate to the caller.
    """

    print("🚀 Starting dd-framework upload...")

    # Create repository
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False
        )
        print(f"✅ Created/verified repository: {repo_id}")
    except Exception as e:
        print(f"❌ Error creating repository: {e}")
        return False

    # Analyze framework components
    print("📊 Analyzing framework components...")
    components = analyze_framework_components()
    print(f"Found {components['total_files']} files with {components['total_lines']:,} total lines")

    # Prepare upload directory
    print("📁 Preparing framework files...")
    upload_dir = prepare_framework_upload()

    # From here on the staging directory exists, so everything runs under one
    # try/finally: the directory is removed even if writing the README or the
    # metadata fails, not only when the upload itself fails.
    try:
        # Create README
        print("📝 Creating dataset card...")
        readme_content = create_framework_readme(repo_id, components)
        # The dataset card contains emoji and other non-ASCII characters, so
        # write it as UTF-8 explicitly rather than in the platform default
        # encoding (which cannot represent them on some systems).
        (upload_dir / "README.md").write_text(readme_content, encoding="utf-8")

        # Create metadata file
        metadata = {
            "repository": "dd-framework",
            "description": "Due diligence methodology and templates",
            "components": components,
            "upload_date": datetime.now().isoformat(),
            "version": "1.0.0",
            "related_repositories": [
                "dd-indexes",
                "dd-vdrs"
            ]
        }
        (upload_dir / "framework_metadata.json").write_text(
            json.dumps(metadata, indent=2), encoding="utf-8"
        )

        # Upload
        try:
            print(f"🚀 Uploading to {repo_id}...")
            upload_folder(
                folder_path=upload_dir,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
                commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology"
            )
            print(f"✅ Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
            print(f"📊 Uploaded {components['total_files']} files, {components['total_lines']:,} lines")
            return True

        except Exception as e:
            # Top-level boundary: report the failure and signal it through
            # the return value instead of crashing the script.
            print(f"❌ Upload failed: {e}")
            return False

    finally:
        # Cleanup
        if upload_dir.exists():
            shutil.rmtree(upload_dir)
            print("🧹 Cleaned up temporary files")

def main():
    """Check the configuration, run the upload and report the outcome.

    Reads the access token from the ``HF_TOKEN`` environment variable. When
    the token is missing, or the repository id is still the template
    placeholder, setup instructions are printed and nothing is uploaded.
    """

    # Configuration
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    if HF_TOKEN:
        token_status = "✅ Set"
    else:
        token_status = "❌ Missing"

    print("🔧 DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    print(f"Token: {token_status}")
    print()

    # Guard: a token is required before anything is attempted.
    if not HF_TOKEN:
        for instruction in (
            "❌ Please set your HF_TOKEN environment variable",
            "1. Go to https://huggingface.co/settings/tokens",
            "2. Create a token with 'write' permissions",
            "3. Run: export HF_TOKEN='your_token_here'",
        ):
            print(instruction)
        return

    # Guard: the template placeholder must have been replaced.
    if REPO_ID == "your-username/dd-framework":
        print("❌ Please update REPO_ID with your actual username!")
        print("Edit this script and change the REPO_ID variable")
        return

    # Run upload
    if not upload_framework(REPO_ID, HF_TOKEN):
        print("\n💥 Upload failed - check error messages above")
        return

    print("\n🎉 Upload completed successfully!")
    print(f"🔗 View your dataset: https://huggingface.co/datasets/{REPO_ID}")
    print("📋 Next steps:")
    print("   - Review the dataset card")
    print("   - Test downloading components")
    print("   - Share with the community!")

# Run the upload only when this file is executed as a script; importing it
# as a module defines the functions without triggering any upload.
if __name__ == "__main__":
    main()