# Source: dd-poc/scripts/upload_dd_framework.py
# Author: Juan Salas — commit 9a71b8f ("Refactor: Move datasets to HuggingFace repos")
#!/usr/bin/env python3
"""
Upload script for dd-framework: Due Diligence Methodology and Templates
This uploads the core framework components: checklists, questions, and strategy docs
"""
import os
import json
import shutil
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder
from typing import Dict, List, Tuple
def _scan_markdown_dir(dir_path: Path) -> Tuple[List[Dict], int]:
    """Gather per-file statistics for every ``*.md`` file in *dir_path*.

    Args:
        dir_path: Directory to scan (non-recursive).

    Returns:
        ``(file_infos, total_lines)`` where ``file_infos`` is a list of
        dicts with ``name``/``path``/``lines``/``size_kb`` keys and
        ``total_lines`` is the summed line count. Both are empty/zero
        when the directory does not exist.
    """
    infos: List[Dict] = []
    total_lines = 0
    if not dir_path.exists():
        return infos, total_lines
    # sorted() makes the report order deterministic; glob order is
    # filesystem-dependent otherwise.
    for file_path in sorted(dir_path.glob("*.md")):
        # Explicit utf-8: markdown content may be non-ASCII and the
        # platform default encoding is not guaranteed.
        lines = len(file_path.read_text(encoding="utf-8").splitlines())
        infos.append({
            "name": file_path.name,
            # glob on a relative base yields relative paths already,
            # so no relative_to() dance is needed.
            "path": str(file_path),
            "lines": lines,
            "size_kb": round(file_path.stat().st_size / 1024, 1),
        })
        total_lines += lines
    return infos, total_lines


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans ``data/checklist``, ``data/questions`` and ``data/strategy``
    (relative to the current working directory) for markdown files.

    Returns:
        Dict with per-component file lists under the keys
        ``"checklists"``, ``"questions"`` and ``"strategy"``, plus
        aggregate ``"total_files"`` and ``"total_lines"`` counters.
    """
    base_path = Path("data")
    components: Dict = {
        "checklists": [],
        "questions": [],
        "strategy": [],
        "total_files": 0,
        "total_lines": 0,
    }
    # The three component groups differ only in result key and
    # subdirectory name; scan them with a shared helper instead of
    # three copy-pasted loops.
    for key, subdir in (
        ("checklists", "checklist"),
        ("questions", "questions"),
        ("strategy", "strategy"),
    ):
        infos, line_count = _scan_markdown_dir(base_path / subdir)
        components[key] = infos
        components["total_files"] += len(infos)
        components["total_lines"] += line_count
    return components
def _format_file_details(items: List[Dict]) -> str:
    """Render one markdown bullet per file entry (name, line count, size)."""
    return "\n".join(
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in items
    )


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create comprehensive README for dd-framework repository.

    Args:
        repo_id: Hub dataset id (e.g. ``"user/dd-framework"``),
            interpolated into download snippets and URLs.
        components: Statistics dict shaped like the output of
            ``analyze_framework_components()``.

    Returns:
        Full README.md content: YAML front matter (dataset card
        metadata) followed by the markdown body.
    """
    # Aggregate size across all three component groups with one pass
    # instead of three separate sums.
    total_size_kb = sum(
        item["size_kb"]
        for key in ("checklists", "questions", "strategy")
        for item in components[key]
    )

    # Per-group bullet lists share one formatter (previously triplicated).
    checklist_details = _format_file_details(components["checklists"])
    questions_details = _format_file_details(components["questions"])
    strategy_details = _format_file_details(components["strategy"])

    # Pre-compute counts so the template below stays simple.
    n_checklists = len(components["checklists"])
    n_questions = len(components["questions"])
    n_strategy = len(components["strategy"])

    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# πŸ“‹ Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## 🎯 What's Included

### πŸ“‘ **Due Diligence Checklists** ({n_checklists} files)

Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### ❓ **Question Templates** ({n_questions} files)

Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### 🎯 **Strategic Analysis Framework** ({n_strategy} files)

Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## πŸ“Š **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## πŸš€ **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd dd-framework
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## πŸ”— **Related Datasets**

This framework is part of a complete due diligence toolkit:

- πŸ“‹ **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- ⚑ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- πŸ“ **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## 🎨 **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## πŸ“ˆ **Framework Structure**

```
data/
β”œβ”€β”€ checklist/
β”‚   β”œβ”€β”€ bloomberg.md             # Bloomberg-style comprehensive checklist
β”‚   └── original.md              # Traditional M&A checklist format
β”œβ”€β”€ questions/
β”‚   β”œβ”€β”€ due diligence.md         # Core question templates
β”‚   └── expanded.md              # Extended question variations
└── strategy/
    β”œβ”€β”€ rockman.md               # Strategic analysis methodology
    └── rockman - alternative.md # Alternative approach
```

## 🏷️ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## βš–οΈ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## πŸ“– **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## πŸ“§ **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""
def prepare_framework_upload() -> Path:
    """Stage the framework components into a fresh upload directory.

    Wipes any previous ``hf_framework_upload`` directory, recreates it
    with a ``data/`` subtree, and copies the checklist, questions and
    strategy folders into it (skipping any that are missing on disk).

    Returns:
        Path to the staging directory.
    """
    staging = Path("hf_framework_upload")

    # Always start from a clean slate so stale files never get uploaded.
    if staging.exists():
        shutil.rmtree(staging)
    staging.mkdir()

    data_root = staging / "data"
    data_root.mkdir()

    # Mirror each framework component directory into the staging tree.
    for name in ("checklist", "questions", "strategy"):
        src_dir = f"data/{name}"
        dst_path = data_root / name
        if not Path(src_dir).exists():
            print(f"⚠️ Skipped {src_dir} (not found)")
            continue
        shutil.copytree(src_dir, dst_path)
        print(f"βœ… Copied {src_dir} -> {dst_path}")

    return staging
def upload_framework(repo_id: str, token: str = None):
    """Build the dd-framework dataset and push it to the Hugging Face Hub.

    Creates (or reuses) the target dataset repo, analyzes the local
    framework files, stages them with a generated dataset card and
    metadata manifest, uploads the folder, and cleans up the staging
    directory afterwards.

    Args:
        repo_id: Target dataset repository id (e.g. "user/dd-framework").
        token: Hub access token with write permission; None falls back to
            ambient credentials.

    Returns:
        True on success, False when repo creation or the upload fails.
    """
    print("πŸš€ Starting dd-framework upload...")
    hf_api = HfApi(token=token)  # retained for parity; create_repo/upload_folder used directly

    # Make sure the target dataset repo exists before staging anything.
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False,
        )
    except Exception as e:
        print(f"❌ Error creating repository: {e}")
        return False
    print(f"βœ… Created/verified repository: {repo_id}")

    print("πŸ“Š Analyzing framework components...")
    stats = analyze_framework_components()
    print(f"Found {stats['total_files']} files with {stats['total_lines']:,} total lines")

    print("πŸ“ Preparing framework files...")
    staging_dir = prepare_framework_upload()

    print("πŸ“ Creating dataset card...")
    card_text = create_framework_readme(repo_id, stats)
    (staging_dir / "README.md").write_text(card_text)

    # Machine-readable manifest shipped alongside the dataset card.
    manifest = {
        "repository": "dd-framework",
        "description": "Due diligence methodology and templates",
        "components": stats,
        "upload_date": datetime.now().isoformat(),
        "version": "1.0.0",
        "related_repositories": [
            "dd-indexes",
            "dd-vdrs",
        ],
    }
    (staging_dir / "framework_metadata.json").write_text(json.dumps(manifest, indent=2))

    try:
        print(f"πŸš€ Uploading to {repo_id}...")
        upload_folder(
            folder_path=staging_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology",
        )
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False
    else:
        print(f"βœ… Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
        print(f"πŸ“Š Uploaded {stats['total_files']} files, {stats['total_lines']:,} lines")
        return True
    finally:
        # Staging dir is disposable; remove it whether or not the upload worked.
        if staging_dir.exists():
            shutil.rmtree(staging_dir)
            print("🧹 Cleaned up temporary files")
def main():
    """Entry point: validate configuration, then run the framework upload.

    Reads the write token from the HF_TOKEN environment variable and
    aborts with setup instructions when it is missing.
    """
    # Target dataset repo; edit this to publish under a different account.
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    print("πŸ”§ DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    print(f"Token: {'βœ… Set' if HF_TOKEN else '❌ Missing'}")
    print()

    # Fail fast with setup instructions when no write token is available.
    if not HF_TOKEN:
        print("❌ Please set your HF_TOKEN environment variable")
        print("1. Go to https://huggingface.co/settings/tokens")
        print("2. Create a token with 'write' permissions")
        print("3. Run: export HF_TOKEN='your_token_here'")
        return

    # NOTE: a placeholder check (REPO_ID == "your-username/dd-framework")
    # used to live here, but it was unreachable because REPO_ID is
    # hard-coded above, so it has been removed.

    success = upload_framework(REPO_ID, HF_TOKEN)
    if success:
        print("\nπŸŽ‰ Upload completed successfully!")
        print(f"πŸ”— View your dataset: https://huggingface.co/datasets/{REPO_ID}")
        print("πŸ“‹ Next steps:")
        print(" - Review the dataset card")
        print(" - Test downloading components")
        print(" - Share with the community!")
    else:
        print("\nπŸ’₯ Upload failed - check error messages above")


if __name__ == "__main__":
    main()