# Source: dd-poc/scripts/upload_dd_framework.py
# Author: Juan Salas — commit 9a71b8f ("Refactor: Move datasets to HuggingFace repos")
#!/usr/bin/env python3
"""
Upload script for dd-framework: Due Diligence Methodology and Templates
This uploads the core framework components: checklists, questions, and strategy docs
"""
import os
import json
import shutil
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder
from typing import Dict, List, Tuple
def _scan_markdown_dir(dir_path: Path) -> Tuple[List[Dict], int]:
    """Gather per-file statistics for every ``*.md`` file in *dir_path*.

    Args:
        dir_path: Directory to scan (non-recursive).

    Returns:
        ``(file_infos, total_lines)`` where ``file_infos`` is a list of
        dicts with ``name``/``path``/``lines``/``size_kb`` keys and
        ``total_lines`` is the summed line count. Both are empty/zero
        when the directory does not exist.
    """
    infos: List[Dict] = []
    total_lines = 0
    if not dir_path.exists():
        return infos, total_lines
    # sorted() makes the report order deterministic; glob order is
    # filesystem-dependent otherwise.
    for file_path in sorted(dir_path.glob("*.md")):
        # Explicit utf-8: markdown content may be non-ASCII and the
        # platform default encoding is not guaranteed.
        lines = len(file_path.read_text(encoding="utf-8").splitlines())
        infos.append({
            "name": file_path.name,
            # glob on a relative base yields relative paths already,
            # so no relative_to() dance is needed.
            "path": str(file_path),
            "lines": lines,
            "size_kb": round(file_path.stat().st_size / 1024, 1),
        })
        total_lines += lines
    return infos, total_lines


def analyze_framework_components() -> Dict:
    """Analyze the framework components and gather statistics.

    Scans ``data/checklist``, ``data/questions`` and ``data/strategy``
    (relative to the current working directory) for markdown files.

    Returns:
        Dict with per-component file lists under the keys
        ``"checklists"``, ``"questions"`` and ``"strategy"``, plus
        aggregate ``"total_files"`` and ``"total_lines"`` counters.
    """
    base_path = Path("data")
    components: Dict = {
        "checklists": [],
        "questions": [],
        "strategy": [],
        "total_files": 0,
        "total_lines": 0,
    }
    # The three component groups differ only in result key and
    # subdirectory name; scan them with a shared helper instead of
    # three copy-pasted loops.
    for key, subdir in (
        ("checklists", "checklist"),
        ("questions", "questions"),
        ("strategy", "strategy"),
    ):
        infos, line_count = _scan_markdown_dir(base_path / subdir)
        components[key] = infos
        components["total_files"] += len(infos)
        components["total_lines"] += line_count
    return components
def _format_file_details(items: List[Dict]) -> str:
    """Render one markdown bullet per file entry (name, line count, size)."""
    return "\n".join(
        f"- **{item['name']}**: {item['lines']} lines, {item['size_kb']}KB"
        for item in items
    )


def create_framework_readme(repo_id: str, components: Dict) -> str:
    """Create comprehensive README for dd-framework repository.

    Args:
        repo_id: Hub dataset id (e.g. ``"user/dd-framework"``),
            interpolated into download snippets and URLs.
        components: Statistics dict shaped like the output of
            ``analyze_framework_components()``.

    Returns:
        Full README.md content: YAML front matter (dataset card
        metadata) followed by the markdown body.
    """
    # Aggregate size across all three component groups with one pass
    # instead of three separate sums.
    total_size_kb = sum(
        item["size_kb"]
        for key in ("checklists", "questions", "strategy")
        for item in components[key]
    )

    # Per-group bullet lists share one formatter (previously triplicated).
    checklist_details = _format_file_details(components["checklists"])
    questions_details = _format_file_details(components["questions"])
    strategy_details = _format_file_details(components["strategy"])

    # Pre-compute counts so the template below stays simple.
    n_checklists = len(components["checklists"])
    n_questions = len(components["questions"])
    n_strategy = len(components["strategy"])

    return f"""---
language:
- en
license: mit
task_categories:
- question-answering
- document-question-answering
- text-classification
tags:
- due-diligence
- legal-framework
- financial-analysis
- m&a
- checklists
- methodology
size_categories:
- n<1K
---

# πŸ“‹ Due Diligence Framework

**Core methodology, checklists, and templates for AI-powered due diligence analysis**

This repository contains the foundational framework components for systematic due diligence analysis, including comprehensive checklists, structured question templates, and strategic analysis methodologies.

## 🎯 What's Included

### πŸ“‘ **Due Diligence Checklists** ({n_checklists} files)

Comprehensive checklists covering all aspects of M&A due diligence:

{checklist_details}

**Coverage Areas:**
- Organizational & Corporate Documents
- Financial & Accounting Records
- Legal Matters & Litigation
- Intellectual Property
- Employment & HR
- Operations & Commercial
- Technology & IT Systems
- Environmental & Regulatory

### ❓ **Question Templates** ({n_questions} files)

Structured question sets for systematic analysis:

{questions_details}

**Question Categories:**
- Corporate Structure & Governance
- Financial Performance & Accounting
- Legal & Compliance Matters
- Business Operations & Strategy
- Risk Assessment & Management

### 🎯 **Strategic Analysis Framework** ({n_strategy} files)

Real-world strategic analysis methodologies:

{strategy_details}

**Strategic Components:**
- M&A Target Assessment
- Market Positioning Analysis
- Technology Stack Evaluation
- Risk-Opportunity Matrix

## πŸ“Š **Dataset Statistics**

- **Total Files**: {components["total_files"]}
- **Total Lines**: {components["total_lines"]:,}
- **Total Size**: {total_size_kb:.1f}KB
- **Format**: Markdown (.md)
- **Language**: English

## πŸš€ **Quick Start**

### Load Individual Components

```python
from huggingface_hub import hf_hub_download

# Download Bloomberg checklist
bloomberg_checklist = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/checklist/bloomberg.md"
)

# Download question templates
questions = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/questions/due diligence.md"
)

# Download strategy framework
strategy = hf_hub_download(
    repo_id="{repo_id}",
    filename="data/strategy/rockman.md"
)
```

### Clone Entire Framework

```bash
git clone https://huggingface.co/datasets/{repo_id}
cd dd-framework
```

### Use with AI Systems

```python
# Example: Load checklist for RAG system
with open("data/checklist/bloomberg.md", "r") as f:
    checklist_content = f.read()

# Parse checklist items
checklist_items = parse_checklist_items(checklist_content)

# Use for document matching, Q&A, etc.
relevant_docs = match_documents_to_checklist(checklist_items, document_corpus)
```

## πŸ”— **Related Datasets**

This framework is part of a complete due diligence toolkit:

- πŸ“‹ **[dd-framework](../dd-framework)** - Methodology and templates *(this repo)*
- ⚑ **[dd-indexes](../dd-indexes)** - Pre-computed search indexes
- πŸ“ **[dd-vdrs](../dd-vdrs)** - Virtual data room documents

## 🎨 **Use Cases**

### For Researchers
- **Legal NLP**: Train models on structured legal/financial templates
- **Question Generation**: Use templates for synthetic Q&A dataset creation
- **Document Classification**: Use checklists as taxonomy for document labeling

### For Developers
- **RAG Systems**: Use as knowledge base for due diligence chatbots
- **Checklist Matching**: Build automated document-to-requirement matching
- **Template Engine**: Generate custom checklists for different industries

### For Practitioners
- **Due Diligence Planning**: Ready-to-use checklists and question sets
- **Process Standardization**: Consistent methodology across engagements
- **Quality Assurance**: Comprehensive coverage verification

## πŸ“ˆ **Framework Structure**

```
data/
β”œβ”€β”€ checklist/
β”‚   β”œβ”€β”€ bloomberg.md             # Bloomberg-style comprehensive checklist
β”‚   └── original.md              # Traditional M&A checklist format
β”œβ”€β”€ questions/
β”‚   β”œβ”€β”€ due diligence.md         # Core question templates
β”‚   └── expanded.md              # Extended question variations
└── strategy/
    β”œβ”€β”€ rockman.md               # Strategic analysis methodology
    └── rockman - alternative.md # Alternative approach
```

## 🏷️ **Methodology**

The framework follows established due diligence best practices:

1. **Comprehensive Coverage**: All critical business areas included
2. **Structured Format**: Consistent markdown formatting for easy parsing
3. **AI-Ready**: Optimized for integration with LLMs and RAG systems
4. **Industry-Standard**: Based on real-world M&A and investment practices
5. **Modular Design**: Components can be used independently or together

## βš–οΈ **Legal & Usage**

- **License**: MIT - Free for commercial and research use
- **Content**: Methodology and templates, no confidential data
- **Attribution**: Citation appreciated but not required

## πŸ“– **Citation**

If you use this framework in your research:

```bibtex
@dataset{{dd_framework_2024,
  title={{Due Diligence Framework: Methodology and Templates for AI-Powered Analysis}},
  author={{AI Due Diligence Project}},
  year={{2024}},
  publisher={{Hugging Face}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```

## πŸ“§ **Contact**

Questions or suggestions? Open an issue or reach out!

---

*Part of the AI Due Diligence project - Making systematic business analysis accessible through AI*
"""
def prepare_framework_upload() -> Path:
    """Stage the framework components into a fresh upload directory.

    Wipes any previous ``hf_framework_upload`` directory, recreates it
    with a ``data/`` subtree, and copies the checklist, questions and
    strategy folders into it (skipping any that are missing on disk).

    Returns:
        Path to the staging directory.
    """
    staging = Path("hf_framework_upload")

    # Always start from a clean slate so stale files never get uploaded.
    if staging.exists():
        shutil.rmtree(staging)
    staging.mkdir()

    data_root = staging / "data"
    data_root.mkdir()

    # Mirror each framework component directory into the staging tree.
    for name in ("checklist", "questions", "strategy"):
        src_dir = f"data/{name}"
        dst_path = data_root / name
        if not Path(src_dir).exists():
            print(f"⚠️ Skipped {src_dir} (not found)")
            continue
        shutil.copytree(src_dir, dst_path)
        print(f"βœ… Copied {src_dir} -> {dst_path}")

    return staging
def upload_framework(repo_id: str, token: str = None):
    """Build the dd-framework dataset and push it to the Hugging Face Hub.

    Creates (or reuses) the target dataset repo, analyzes the local
    framework files, stages them with a generated dataset card and
    metadata manifest, uploads the folder, and cleans up the staging
    directory afterwards.

    Args:
        repo_id: Target dataset repository id (e.g. "user/dd-framework").
        token: Hub access token with write permission; None falls back to
            ambient credentials.

    Returns:
        True on success, False when repo creation or the upload fails.
    """
    print("πŸš€ Starting dd-framework upload...")
    hf_api = HfApi(token=token)  # retained for parity; create_repo/upload_folder used directly

    # Make sure the target dataset repo exists before staging anything.
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False,
        )
    except Exception as e:
        print(f"❌ Error creating repository: {e}")
        return False
    print(f"βœ… Created/verified repository: {repo_id}")

    print("πŸ“Š Analyzing framework components...")
    stats = analyze_framework_components()
    print(f"Found {stats['total_files']} files with {stats['total_lines']:,} total lines")

    print("πŸ“ Preparing framework files...")
    staging_dir = prepare_framework_upload()

    print("πŸ“ Creating dataset card...")
    card_text = create_framework_readme(repo_id, stats)
    (staging_dir / "README.md").write_text(card_text)

    # Machine-readable manifest shipped alongside the dataset card.
    manifest = {
        "repository": "dd-framework",
        "description": "Due diligence methodology and templates",
        "components": stats,
        "upload_date": datetime.now().isoformat(),
        "version": "1.0.0",
        "related_repositories": [
            "dd-indexes",
            "dd-vdrs",
        ],
    }
    (staging_dir / "framework_metadata.json").write_text(json.dumps(manifest, indent=2))

    try:
        print(f"πŸš€ Uploading to {repo_id}...")
        upload_folder(
            folder_path=staging_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message="Upload dd-framework v1.0.0 - Core due diligence methodology",
        )
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False
    else:
        print(f"βœ… Successfully uploaded to https://huggingface.co/datasets/{repo_id}")
        print(f"πŸ“Š Uploaded {stats['total_files']} files, {stats['total_lines']:,} lines")
        return True
    finally:
        # Staging dir is disposable; remove it whether or not the upload worked.
        if staging_dir.exists():
            shutil.rmtree(staging_dir)
            print("🧹 Cleaned up temporary files")
def main():
    """Entry point: validate configuration, then run the framework upload.

    Reads the write token from the HF_TOKEN environment variable and
    aborts with setup instructions when it is missing.
    """
    # Target dataset repo; edit this to publish under a different account.
    REPO_ID = "jmzlx/dd-framework"
    HF_TOKEN = os.getenv("HF_TOKEN")

    print("πŸ”§ DD-Framework Upload Configuration")
    print(f"Repository: {REPO_ID}")
    print(f"Token: {'βœ… Set' if HF_TOKEN else '❌ Missing'}")
    print()

    # Fail fast with setup instructions when no write token is available.
    if not HF_TOKEN:
        print("❌ Please set your HF_TOKEN environment variable")
        print("1. Go to https://huggingface.co/settings/tokens")
        print("2. Create a token with 'write' permissions")
        print("3. Run: export HF_TOKEN='your_token_here'")
        return

    # NOTE: a placeholder check (REPO_ID == "your-username/dd-framework")
    # used to live here, but it was unreachable because REPO_ID is
    # hard-coded above, so it has been removed.

    success = upload_framework(REPO_ID, HF_TOKEN)
    if success:
        print("\nπŸŽ‰ Upload completed successfully!")
        print(f"πŸ”— View your dataset: https://huggingface.co/datasets/{REPO_ID}")
        print("πŸ“‹ Next steps:")
        print(" - Review the dataset card")
        print(" - Test downloading components")
        print(" - Share with the community!")
    else:
        print("\nπŸ’₯ Upload failed - check error messages above")


if __name__ == "__main__":
    main()