|
|
|
|
|
""" |
|
|
Upload script for dd-framework: Due Diligence Methodology and Templates |
|
|
This uploads the core framework components: checklists, questions, and strategy docs |
|
|
""" |
|
|
|
|
|
import json
import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from huggingface_hub import HfApi, create_repo, upload_folder
|
|
|
|
|
def _scan_markdown_dir(directory: Path) -> List[Dict]:
    """Return per-file stats for every .md file directly inside *directory*.

    Each entry records the file name, its path (relative to the cwd), its
    line count, and its size in KB rounded to one decimal place. Returns
    an empty list when the directory does not exist.
    """
    entries: List[Dict] = []
    if not directory.exists():
        return entries
    # sorted() makes the result order deterministic across filesystems.
    for file_path in sorted(directory.glob("*.md")):
        # Read as UTF-8 explicitly; the platform default encoding varies.
        line_count = len(file_path.read_text(encoding="utf-8").splitlines())
        entries.append({
            "name": file_path.name,
            # file_path is already cwd-relative, so str() is the rel path.
            "path": str(file_path),
            "lines": line_count,
            "size_kb": round(file_path.stat().st_size / 1024, 1)
        })
    return entries


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans data/checklist, data/questions and data/strategy for markdown
    files and returns a dict with the per-component file entries plus
    aggregate "total_files" and "total_lines" counts. Missing component
    directories simply contribute empty lists.
    """
    base_path = Path("data")
    components: Dict = {
        "checklists": [],
        "questions": [],
        "strategy": [],
        "total_files": 0,
        "total_lines": 0
    }
    # The three scans were copy-pasted before; drive them from one table
    # instead (result key -> subdirectory name under data/).
    for key, subdir in (("checklists", "checklist"),
                        ("questions", "questions"),
                        ("strategy", "strategy")):
        entries = _scan_markdown_dir(base_path / subdir)
        components[key] = entries
        components["total_files"] += len(entries)
        components["total_lines"] += sum(e["lines"] for e in entries)
    return components
|
|
|
|
|
def _format_details(items: List[Dict]) -> str:
    """Render one markdown bullet per file entry: name, line count, size."""
    return "\n".join(
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in items
    )


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create the comprehensive README (dataset card) for the dd-framework repo.

    Args:
        repo_id: Hub dataset id (e.g. "user/dd-framework"), interpolated into
            download snippets and citation URLs.
        components: Statistics dict as produced by analyze_framework_components().

    Returns:
        Full README.md content including YAML front matter.
    """
    # Aggregate size across all three component groups.
    total_size_kb = sum(
        item["size_kb"]
        for group in ("checklists", "questions", "strategy")
        for item in components[group]
    )

    # Previously three copy-pasted join expressions; one helper now.
    checklist_details = _format_details(components["checklists"])
    questions_details = _format_details(components["questions"])
    strategy_details = _format_details(components["strategy"])

    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# π Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## π― What's Included

### π **Due Diligence Checklists** ({len(components["checklists"])} files)
Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### β **Question Templates** ({len(components["questions"])} files)
Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### π― **Strategic Analysis Framework** ({len(components["strategy"])} files)
Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## π **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## π **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd dd-framework
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## π **Related Datasets**

This framework is part of a complete due diligence toolkit:

- π **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- β‘ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- π **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## π¨ **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## π **Framework Structure**

```
data/
βββ checklist/
β   βββ bloomberg.md          # Bloomberg-style comprehensive checklist
β   βββ original.md           # Traditional M&A checklist format
βββ questions/
β   βββ due diligence.md      # Core question templates
β   βββ expanded.md           # Extended question variations
βββ strategy/
    βββ rockman.md             # Strategic analysis methodology
    βββ rockman - alternative.md  # Alternative approach
```

## π·οΈ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## βοΈ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## π **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## π§ **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""
|
|
|
|
|
def prepare_framework_upload() -> Path:
    """Stage the framework components in a clean directory for upload.

    Copies data/checklist, data/questions and data/strategy (when present)
    into hf_framework_upload/data/ and returns the staging directory path.
    Missing source directories are skipped with a warning.
    """
    upload_dir = Path("hf_framework_upload")

    # Start from a clean slate so stale files never get uploaded.
    if upload_dir.exists():
        shutil.rmtree(upload_dir)
    upload_dir.mkdir()

    data_dst = upload_dir / "data"
    data_dst.mkdir()

    # The destination folder name always mirrors the source folder name,
    # so derive it instead of maintaining a redundant (src, dst) table.
    for src_dir in ("data/checklist", "data/questions", "data/strategy"):
        src_path = Path(src_dir)
        dst_path = data_dst / src_path.name
        if src_path.exists():
            shutil.copytree(src_path, dst_path)
            print(f"β Copied {src_dir} -> {dst_path}")
        else:
            print(f"β οΈ Skipped {src_dir} (not found)")

    return upload_dir
|
|
|
|
|
def upload_framework(repo_id: str, token: Optional[str] = None) -> bool:
    """Upload the dd-framework dataset to the Hugging Face Hub.

    Creates (or reuses) the dataset repository, analyzes and stages the
    framework files, generates the README dataset card plus a JSON
    metadata manifest, and pushes everything in one commit.

    Args:
        repo_id: Target dataset repository id, e.g. "user/dd-framework".
        token: Hugging Face token with write permission; when None the
            ambient credentials configured on the machine are used.

    Returns:
        True on success, False when repo creation or the upload fails.
    """
    print("π Starting dd-framework upload...")

    # Create the dataset repo first; exist_ok makes re-runs idempotent.
    # NOTE: the module-level create_repo/upload_folder helpers are used
    # directly, so no HfApi instance is needed here (the previous unused
    # `api = HfApi(token=token)` local has been removed).
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False
        )
        print(f"β Created/verified repository: {repo_id}")
    except Exception as e:
        print(f"β Error creating repository: {e}")
        return False

    print("π Analyzing framework components...")
    components = analyze_framework_components()
    print(f"Found {components['total_files']} files with {components['total_lines']:,} total lines")

    print("π Preparing framework files...")
    upload_dir = prepare_framework_upload()

    print("π Creating dataset card...")
    readme_content = create_framework_readme(repo_id, components)
    (upload_dir / "README.md").write_text(readme_content)

    # Machine-readable manifest alongside the human-readable README.
    metadata = {
        "repository": "dd-framework",
        "description": "Due diligence methodology and templates",
        "components": components,
        "upload_date": datetime.now().isoformat(),
        "version": "1.0.0",
        "related_repositories": [
            "dd-indexes",
            "dd-vdrs"
        ]
    }
    (upload_dir / "framework_metadata.json").write_text(json.dumps(metadata, indent=2))

    try:
        print(f"π Uploading to {repo_id}...")
        upload_folder(
            folder_path=upload_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology"
        )
        print(f"β Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
        print(f"π Uploaded {components['total_files']} files, {components['total_lines']:,} lines")
        return True
    except Exception as e:
        print(f"β Upload failed: {e}")
        return False
    finally:
        # Always remove the staging directory, even when the upload fails.
        if upload_dir.exists():
            shutil.rmtree(upload_dir)
            print("π§Ή Cleaned up temporary files")
|
|
|
|
def main():
    """Entry point: read configuration from the environment and run the upload."""
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    print("π§ DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    print(f"Token: {'β Set' if HF_TOKEN else 'β Missing'}")
    print()

    # A write token is required for create_repo/upload_folder.
    if not HF_TOKEN:
        print("β Please set your HF_TOKEN environment variable")
        print("1. Go to https://huggingface.co/settings/tokens")
        print("2. Create a token with 'write' permissions")
        print("3. Run: export HF_TOKEN='your_token_here'")
        return

    # NOTE: the old `REPO_ID == "your-username/dd-framework"` placeholder
    # guard was unreachable (REPO_ID is hardcoded above) and was removed.
    success = upload_framework(REPO_ID, HF_TOKEN)

    if success:
        print("\nπ Upload completed successfully!")
        print(f"π View your dataset: https://huggingface.co/datasets/{REPO_ID}")
        print("π Next steps:")
        print("  - Review the dataset card")
        print("  - Test downloading components")
        print("  - Share with the community!")
    else:
        print("\nπ₯ Upload failed - check error messages above")
|
|
|
|
# Run the upload only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|