File size: 13,631 Bytes
9a71b8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
#!/usr/bin/env python3
"""
Upload script for dd-framework: Due Diligence Methodology and Templates
This uploads the core framework components: checklists, questions, and strategy docs
"""

import os
import json
import shutil
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder
from typing import Dict, List, Tuple

def _collect_markdown_stats(directory: Path) -> List[Dict]:
    """Return one statistics record per ``*.md`` file directly inside *directory*.

    Each record holds the file ``name``, its ``path`` relative to the current
    working directory, its ``lines`` count and its ``size_kb`` (bytes / 1024,
    rounded to one decimal). A missing directory yields an empty list.
    """
    if not directory.exists():
        return []

    records = []
    # glob() yields entries in arbitrary filesystem order; sorting keeps the
    # generated README and metadata stable from run to run.
    for file_path in sorted(directory.glob("*.md")):
        # Decode explicitly as UTF-8 so the line count does not depend on the
        # platform's default encoding (e.g. cp1252 on Windows).
        text = file_path.read_text(encoding="utf-8")
        records.append({
            "name": file_path.name,
            "path": str(file_path.relative_to(Path("."))),
            "lines": len(text.splitlines()),
            "size_kb": round(file_path.stat().st_size / 1024, 1),
        })
    return records


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans ``data/checklist``, ``data/questions`` and ``data/strategy``
    (relative to the current working directory) for Markdown files.

    Returns:
        A dict with the keys ``checklists``, ``questions`` and ``strategy``
        (each a list of per-file records sorted by path), plus the aggregate
        counters ``total_files`` and ``total_lines``.

    Raises:
        UnicodeDecodeError: If a Markdown file is not valid UTF-8.
    """
    base_path = Path("data")

    # (result key, sub-directory name) - note the checklist directory is
    # singular while its result key is plural.
    component_dirs = (
        ("checklists", "checklist"),
        ("questions", "questions"),
        ("strategy", "strategy"),
    )

    components: Dict = {
        key: _collect_markdown_stats(base_path / subdir)
        for key, subdir in component_dirs
    }

    all_records = [
        record for key, _ in component_dirs for record in components[key]
    ]
    components["total_files"] = len(all_records)
    components["total_lines"] = sum(record["lines"] for record in all_records)

    return components

def _format_component_list(items: List[Dict]) -> str:
    """Render component records as a Markdown bullet list, one line per file."""
    return "\n".join(
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in items
    )


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create the README (dataset card) text for the dd-framework repository.

    Args:
        repo_id: Hub repository id, e.g. ``"user/dd-framework"``. It is
            substituted into the download, clone and citation examples.
        components: Statistics dict with the list keys ``checklists``,
            ``questions`` and ``strategy`` (records carrying ``name``,
            ``lines`` and ``size_kb``) and the counters ``total_files`` and
            ``total_lines``.

    Returns:
        The complete README as a string: YAML front matter followed by the
        Markdown body.
    """
    # Total size is the sum of the per-file (already rounded) sizes.
    total_size_kb = sum(
        sum(item["size_kb"] for item in components[key])
        for key in ("checklists", "questions", "strategy")
    )

    checklist_details = _format_component_list(components["checklists"])
    questions_details = _format_component_list(components["questions"])
    strategy_details = _format_component_list(components["strategy"])

    checklist_count = len(components["checklists"])
    questions_count = len(components["questions"])
    strategy_count = len(components["strategy"])

    # `git clone` creates a directory named after the repository, not after a
    # fixed string, so derive the `cd` target from repo_id.
    repo_name = repo_id.rsplit("/", 1)[-1]

    # NOTE: literal braces in the BibTeX entry below are doubled because this
    # is an f-string.
    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# 📋 Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## 🎯 What's Included

### 📑 **Due Diligence Checklists** ({checklist_count} files)
Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records  
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### ❓ **Question Templates** ({questions_count} files)
Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### 🎯 **Strategic Analysis Framework** ({strategy_count} files)
Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## 📊 **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## 🚀 **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates  
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}", 
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd {repo_name}
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## 🔗 **Related Datasets**

This framework is part of a complete due diligence toolkit:

- 📋 **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- ⚡ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- 📁 **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## 🎨 **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## 📈 **Framework Structure**

```
data/
├── checklist/
│   ├── bloomberg.md          # Bloomberg-style comprehensive checklist
│   └── original.md           # Traditional M&A checklist format
├── questions/  
│   ├── due diligence.md      # Core question templates
│   └── expanded.md           # Extended question variations
└── strategy/
    ├── rockman.md            # Strategic analysis methodology
    └── rockman - alternative.md  # Alternative approach
```

## 🏷️ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## ⚖️ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## 📖 **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## 📧 **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""

def prepare_framework_upload() -> Path:
    """Prepare the upload directory with the framework components only.

    Recreates ``hf_framework_upload/`` in the current working directory
    (removing any previous copy) and copies ``data/checklist``,
    ``data/questions`` and ``data/strategy`` into ``hf_framework_upload/data``.
    Source directories that do not exist are skipped with a warning.

    Returns:
        Path to the freshly populated upload directory.
    """
    upload_dir = Path("hf_framework_upload")

    # Start from a clean slate so stale files from an earlier run are never
    # uploaded.
    if upload_dir.exists():
        shutil.rmtree(upload_dir)

    # parents=True creates the upload directory and its data/ child together.
    data_dst = upload_dir / "data"
    data_dst.mkdir(parents=True)

    # (source directory, name under the upload's data/ directory)
    components_to_copy = [
        ("data/checklist", "checklist"),
        ("data/questions", "questions"),
        ("data/strategy", "strategy"),
    ]

    for src_dir, dst_dir in components_to_copy:
        src_path = Path(src_dir)
        dst_path = data_dst / dst_dir

        if not src_path.exists():
            print(f"⚠️  Skipped {src_dir} (not found)")
            continue

        shutil.copytree(src_path, dst_path)
        print(f"✅ Copied {src_dir} -> {dst_path}")

    return upload_dir

def upload_framework(repo_id: str, token: str = None) -> bool:
    """Upload dd-framework to the Hugging Face Hub as a public dataset repo.

    Steps: create (or verify) the repository, gather component statistics,
    stage the files in a temporary upload directory together with a generated
    ``README.md`` and ``framework_metadata.json``, upload the folder, and
    finally remove the staging directory.

    Args:
        repo_id: Target repository id, e.g. ``"user/dd-framework"``.
        token: Hugging Face access token passed to the Hub calls. ``None`` is
            forwarded unchanged.

    Returns:
        ``True`` if the upload succeeded, ``False`` if creating the repository
        or uploading the folder raised an exception (the error is printed).
        Errors while analysing or staging files propagate to the caller.
    """
    print("🚀 Starting dd-framework upload...")

    # Create repository (exist_ok makes this a no-op for an existing repo).
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False
        )
        print(f"✅ Created/verified repository: {repo_id}")
    except Exception as e:
        print(f"❌ Error creating repository: {e}")
        return False

    # Analyze framework components
    print("📊 Analyzing framework components...")
    components = analyze_framework_components()
    print(f"Found {components['total_files']} files with {components['total_lines']:,} total lines")

    # Prepare upload directory
    print("📁 Preparing framework files...")
    upload_dir = prepare_framework_upload()

    # Everything after staging runs under try/finally so the staging directory
    # is removed even if writing the README or metadata fails.
    try:
        # Create README
        print("📝 Creating dataset card...")
        readme_content = create_framework_readme(repo_id, components)
        # The dataset card contains emoji; write it as UTF-8 explicitly so the
        # platform default encoding cannot raise UnicodeEncodeError.
        (upload_dir / "README.md").write_text(readme_content, encoding="utf-8")

        # Create metadata file
        metadata = {
            "repository": "dd-framework",
            "description": "Due diligence methodology and templates",
            "components": components,
            "upload_date": datetime.now().isoformat(),
            "version": "1.0.0",
            "related_repositories": [
                "dd-indexes",
                "dd-vdrs"
            ]
        }
        (upload_dir / "framework_metadata.json").write_text(
            json.dumps(metadata, indent=2), encoding="utf-8"
        )

        # Upload
        try:
            print(f"🚀 Uploading to {repo_id}...")
            upload_folder(
                folder_path=upload_dir,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
                commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology"
            )
            print(f"✅ Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
            print(f"📊 Uploaded {components['total_files']} files, {components['total_lines']:,} lines")
            return True

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            return False

    finally:
        # Cleanup
        if upload_dir.exists():
            shutil.rmtree(upload_dir)
            print("🧹 Cleaned up temporary files")

def main() -> int:
    """Run the dd-framework upload using configuration from this script.

    Reads the access token from the ``HF_TOKEN`` environment variable, prints
    the effective configuration, validates it, and calls
    ``upload_framework``.

    Returns:
        Process exit status: ``0`` when the upload succeeded, ``1`` when the
        token is missing, the repository id is still the placeholder, or the
        upload failed.
    """
    # Configuration
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    print("🔧 DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    # Only report whether a token is present; never echo the secret itself.
    print(f"Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
    print()

    if not HF_TOKEN:
        print("❌ Please set your HF_TOKEN environment variable")
        print("1. Go to https://huggingface.co/settings/tokens")
        print("2. Create a token with 'write' permissions")
        print("3. Run: export HF_TOKEN='your_token_here'")
        return 1

    # Guard against running the script with the template placeholder id.
    if REPO_ID == "your-username/dd-framework":
        print("❌ Please update REPO_ID with your actual username!")
        print("Edit this script and change the REPO_ID variable")
        return 1

    # Run upload
    success = upload_framework(REPO_ID, HF_TOKEN)

    if not success:
        print("\n💥 Upload failed - check error messages above")
        return 1

    print("\n🎉 Upload completed successfully!")
    print(f"🔗 View your dataset: https://huggingface.co/datasets/{REPO_ID}")
    print("📋 Next steps:")
    print("   - Review the dataset card")
    print("   - Test downloading components")
    print("   - Share with the community!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect a failed upload.
    raise SystemExit(main())