{ "cells": [ { "cell_type": "markdown", "id": "f075cbbd-9f85-4cc6-b8cb-da5fc4f1c8c9", "metadata": {}, "source": [ "# Scalable Contract Generator for AI Testing - COMPLETE IMPLEMENTATION\n", "\n", "### Generates 200+ contracts with actual PDF/DOCX content and proper folder structure\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "18d2c999-0e2d-49f0-8d0e-25625306d217", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "šŸŽÆ Scalable Contract Dataset Generator for AI Testing\n", "============================================================\n", "āœ… Created organized folder structure\n", "šŸŽÆ Initialized Scalable Generator for 200 contracts\n", "šŸ“ Dataset structure: ../data/sample_data\n", "šŸš€ Generating 200 contracts...\n", "============================================================\n", "\n", "šŸ“Š Generating 80 low_risk contracts:\n", "----------------------------------------\n", " āœ… Progress: 50/200\n", "\n", "šŸ“Š Generating 70 medium_risk contracts:\n", "----------------------------------------\n", " āœ… Progress: 100/200\n", " āœ… Progress: 150/200\n", "\n", "šŸ“Š Generating 50 high_risk contracts:\n", "----------------------------------------\n", " āœ… Progress: 200/200\n", "\n", "šŸ“Š Creating dataset splits...\n", " āœ… train: 140 contracts\n", " āœ… test: 40 contracts\n", " āœ… validation: 20 contracts\n", "\n", "============================================================\n", "šŸ“Š DATASET GENERATION COMPLETE!\n", "============================================================\n", "šŸ“ Dataset Location: ../data/sample_data\n", "šŸ“„ Total Contracts: 200\n", "šŸŽÆ Risk Distribution:\n", " • low_risk: 80 contracts\n", " • medium_risk: 70 contracts\n", " • high_risk: 50 contracts\n", "šŸ“ Format Distribution:\n", " • PDF: 103 contracts\n", " • DOCX: 97 contracts\n", "šŸ“‘ Contract Types: 30 unique types\n", "šŸ”€ Dataset Splits:\n", " • Training: 70%\n", " • Testing: 20%\n", " • Validation: 10%\n", "šŸ“ˆ Perfect for supervised AI training! šŸš€\n", "\n", "āœ… Dataset ready for AI training!\n", "šŸ“ Location: ../data/sample_data\n" ] } ], "source": [ "# DEPENDENCIES\n", "import os\n", "import random\n", "from datetime import datetime, timedelta\n", "from pathlib import Path\n", "from faker import Faker\n", "from docx import Document\n", "from docx.shared import Inches\n", "from reportlab.lib.pagesizes import LETTER\n", "from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak\n", "from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\n", "from reportlab.lib import colors\n", "from reportlab.lib.units import inch\n", "import json\n", "\n", "class ScalableContractGenerator:\n", " def __init__(self, total_contracts=200):\n", " self.fake = Faker()\n", " self.total_contracts = total_contracts\n", " self.base_dir = Path(\"../data/sample_data/\")\n", " self.setup_folder_structure()\n", " \n", " # Expanded contract types\n", " self.contract_types = [\n", " 'employment_agreement', 'commercial_lease', 'residential_lease',\n", " 'business_partnership', 'consulting_services', 'loan_agreement', \n", " 'software_license', 'confidentiality_agreement', 'asset_purchase',\n", " 'freelance_contract', 'equipment_lease', 'service_maintenance',\n", " 'construction_contract', 'sales_agreement', 'distribution_agreement',\n", " 'joint_venture', 'professional_services', 'technology_license',\n", " 'marketing_agreement', 'subscription_service', 'independent_contractor',\n", " 'non_compete_agreement', 'intellectual_property', 'project_management',\n", " 'supply_agreement', 'franchise_agreement', 'merger_agreement',\n", " 'severance_agreement', 'settlement_agreement', 'sponsorship_agreement'\n", " ]\n", " \n", " # Risk levels with distribution weights\n", " self.risk_levels = {\n", " 'low_risk': 0.4, # 40% of contracts\n", " 'medium_risk': 0.35, # 35% of contracts \n", " 'high_risk': 0.25 # 25% of contracts\n", " }\n", " \n", " # Document formats\n", " self.formats = ['pdf', 'docx']\n", " \n", " # Enhanced risk variations\n", " self.risk_clauses = self._initialize_risk_clauses()\n", " \n", " print(f\"šŸŽÆ Initialized Scalable Generator for {total_contracts} contracts\")\n", " print(f\"šŸ“ Dataset structure: {self.base_dir}\")\n", "\n", " def setup_folder_structure(self):\n", " \"\"\"Create organized folder structure\"\"\"\n", " folders = [\n", " 'contracts/pdf',\n", " 'contracts/docx', \n", " 'annotations',\n", " 'train_split',\n", " 'test_split',\n", " 'validation_split',\n", " 'reports'\n", " ]\n", " \n", " for folder in folders:\n", " (self.base_dir / folder).mkdir(parents=True, exist_ok=True)\n", " \n", " print(\"āœ… Created organized folder structure\")\n", "\n", " def _initialize_risk_clauses(self):\n", " \"\"\"Initialize comprehensive risk clause variations\"\"\"\n", " return {\n", " 'liability': {\n", " 'low_risk': [\n", " \"Liability shall be limited to the amount paid under this agreement.\",\n", " \"Total liability shall not exceed the contract value.\",\n", " \"Liability is capped at direct damages only.\",\n", " \"Maximum liability limited to insurance coverage amounts.\",\n", " \"No liability for indirect or consequential damages.\"\n", " ],\n", " 'medium_risk': [\n", " \"Liability limited to direct damages up to twice the contract value.\",\n", " \"No liability for indirect damages except for gross negligence.\",\n", " \"Liability capped at three times the annual contract value.\",\n", " \"Limited liability for third-party claims.\",\n", " \"Liability exclusions for force majeure events.\"\n", " ],\n", " 'high_risk': [\n", " \"Unlimited liability for all damages arising from this agreement.\",\n", " \"Parties assume full liability for all direct and consequential damages.\",\n", " \"No limitation of liability applies to any claims under this agreement.\",\n", " \"Liability includes punitive damages and all legal expenses.\",\n", " \"Complete assumption of all risks and liabilities.\"\n", " ]\n", " },\n", " 'termination': {\n", " 'low_risk': [\n", " \"Either party may terminate with 30 days written notice for convenience.\",\n", " \"Termination requires mutual agreement or material breach.\",\n", " \"Parties may terminate for cause with cure period of 30 days.\",\n", " \"Termination only for specified material breaches.\",\n", " \"Mutual termination rights with notice periods.\"\n", " ],\n", " 'medium_risk': [\n", " \"Termination permitted with 15 days notice for any reason.\",\n", " \"Immediate termination for breach of payment terms.\",\n", " \"Termination for convenience with 50% early termination fee.\",\n", " \"Termination for insolvency or change of control.\",\n", " \"Limited termination rights with penalties.\"\n", " ],\n", " 'high_risk': [\n", " \"Either party may terminate immediately without cause.\",\n", " \"Termination at will with no notice requirement.\", \n", " \"Immediate termination for any breach, however minor.\",\n", " \"Unilateral termination rights without penalty.\",\n", " \"Termination for subjective dissatisfaction.\"\n", " ]\n", " },\n", " 'indemnification': {\n", " 'low_risk': [\n", " \"Indemnification limited to third-party claims arising from negligence.\",\n", " \"Mutual indemnification for intellectual property infringement.\",\n", " \"Indemnification capped at contract value.\",\n", " \"Proportional indemnification based on fault.\",\n", " \"Standard indemnification for direct damages only.\"\n", " ],\n", " 'medium_risk': [\n", " \"One-way indemnification favoring the client.\",\n", " \"Indemnification includes legal fees and settlement costs.\",\n", " \"Indemnification for all claims related to services provided.\",\n", " \"Broad indemnification with some limitations.\",\n", " \"Indemnification for regulatory violations.\"\n", " ],\n", " 'high_risk': [\n", " \"Unlimited indemnification for all claims and damages.\",\n", " \"Indemnification includes punitive damages and all legal expenses.\",\n", " \"Broad indemnification covering all business activities.\",\n", " \"Indemnification for all losses regardless of cause.\",\n", " \"Complete hold harmless agreement.\"\n", " ]\n", " },\n", " 'warranty': {\n", " 'low_risk': [\n", " \"Warranties limited to those expressly stated in this agreement.\",\n", " \"No implied warranties, including merchantability or fitness.\",\n", " \"As-is basis with all faults, no additional warranties.\",\n", " \"Limited warranty for workmanship and materials.\",\n", " \"Standard industry warranties apply.\"\n", " ],\n", " 'medium_risk': [\n", " \"Implied warranties limited to 90 days from effective date.\",\n", " \"Warranties exclude normal wear and tear.\",\n", " \"Limited warranty for specific components only.\",\n", " \"Warranty limitations for consumable items.\",\n", " \"Modified warranty terms with exceptions.\"\n", " ],\n", " 'high_risk': [\n", " \"No warranties of any kind, express or implied.\",\n", " \"Services provided 'as-is' without any performance guarantees.\",\n", " \"All warranties disclaimed to maximum extent permitted by law.\",\n", " \"No warranty of fitness for particular purpose.\",\n", " \"Complete disclaimer of all representations and warranties.\"\n", " ]\n", " },\n", " 'confidentiality': {\n", " 'low_risk': [\n", " \"Confidentiality obligations survive for 2 years post-termination.\",\n", " \"Standard confidentiality with reasonable protection measures.\",\n", " \"Mutual confidentiality with standard exceptions.\",\n", " \"Confidentiality for specifically marked information only.\",\n", " \"Standard non-disclosure terms apply.\"\n", " ],\n", " 'medium_risk': [\n", " \"Confidentiality perpetual for trade secrets, 5 years for other information.\",\n", " \"Heightened confidentiality with specific security requirements.\",\n", " \"One-way confidentiality favoring disclosing party.\",\n", " \"Confidentiality for all business information shared.\",\n", " \"Enhanced protection for sensitive data.\"\n", " ],\n", " 'high_risk': [\n", " \"Perpetual confidentiality for all information.\",\n", " \"No right to use residual knowledge or general skills.\",\n", " \"Confidentiality extends to all business information regardless of marking.\",\n", " \"Lifetime confidentiality obligations.\",\n", " \"Complete prohibition on use of confidential information.\"\n", " ]\n", " }\n", " }\n", "\n", " def generate_contract_dataset(self):\n", " \"\"\"Generate comprehensive contract dataset\"\"\"\n", " print(f\"šŸš€ Generating {self.total_contracts} contracts...\")\n", " print(\"=\" * 60)\n", " \n", " dataset_stats = {\n", " 'total_contracts': 0,\n", " 'by_risk_level': {'low_risk': 0, 'medium_risk': 0, 'high_risk': 0},\n", " 'by_format': {'pdf': 0, 'docx': 0},\n", " 'by_type': {}\n", " }\n", " \n", " contracts_per_risk = self._calculate_contract_distribution()\n", " \n", " for risk_level, count in contracts_per_risk.items():\n", " print(f\"\\nšŸ“Š Generating {count} {risk_level} contracts:\")\n", " print(\"-\" * 40)\n", " \n", " for i in range(count):\n", " contract_type = random.choice(self.contract_types)\n", " doc_format = random.choice(self.formats)\n", " \n", " try:\n", " # Generate contract\n", " contract_data = self._generate_contract_data(contract_type, risk_level)\n", " \n", " if doc_format == 'pdf':\n", " file_path = self.generate_pdf_contract(contract_type, contract_data)\n", " else:\n", " file_path = self.generate_docx_contract(contract_type, contract_data)\n", " \n", " if file_path:\n", " # Save annotation\n", " self._save_annotation(contract_data, file_path)\n", " \n", " # Update stats\n", " dataset_stats['total_contracts'] += 1\n", " dataset_stats['by_risk_level'][risk_level] += 1\n", " dataset_stats['by_format'][doc_format] += 1\n", " dataset_stats['by_type'][contract_type] = dataset_stats['by_type'].get(contract_type, 0) + 1\n", " \n", " if dataset_stats['total_contracts'] % 50 == 0:\n", " print(f\" āœ… Progress: {dataset_stats['total_contracts']}/{self.total_contracts}\")\n", " \n", " except Exception as e:\n", " print(f\" āŒ Error generating contract {i+1}: {e}\")\n", " continue\n", " \n", " # Create dataset splits\n", " self._create_dataset_splits()\n", " \n", " # Generate comprehensive report\n", " self._generate_dataset_report(dataset_stats)\n", " \n", " return dataset_stats\n", "\n", " def _calculate_contract_distribution(self):\n", " \"\"\"Calculate how many contracts to generate for each risk level\"\"\"\n", " contracts_per_risk = {}\n", " for risk_level, weight in self.risk_levels.items():\n", " contracts_per_risk[risk_level] = int(self.total_contracts * weight)\n", " \n", " # Adjust for rounding\n", " total_allocated = sum(contracts_per_risk.values())\n", " if total_allocated < self.total_contracts:\n", " contracts_per_risk['medium_risk'] += (self.total_contracts - total_allocated)\n", " \n", " return contracts_per_risk\n", "\n", " def _generate_contract_data(self, contract_type, risk_level):\n", " \"\"\"Generate comprehensive contract data with risk annotations\"\"\"\n", " base_data = {\n", " 'contract_id': f\"CT-{self.fake.unique.random_number(digits=8)}\",\n", " 'effective_date': self.fake.date_between(start_date='-30d', end_date='+30d').strftime('%B %d, %Y'),\n", " 'execution_date': self.fake.date_between(start_date='-60d', end_date='-1d').strftime('%B %d, %Y'),\n", " 'generation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),\n", " 'contract_type': contract_type,\n", " 'risk_level': risk_level,\n", " 'risk_annotations': {},\n", " 'clauses_used': {},\n", " 'parties': [],\n", " 'metadata': {}\n", " }\n", " \n", " # Generate risk-appropriate clauses\n", " for clause_type in self.risk_clauses.keys():\n", " base_data['clauses_used'][clause_type] = random.choice(\n", " self.risk_clauses[clause_type][risk_level]\n", " )\n", " \n", " # Add risk annotations\n", " base_data['risk_annotations'] = self._calculate_risk_scores(risk_level)\n", " \n", " # Generate contract-specific content\n", " base_data.update(self._generate_contract_specific_data(contract_type))\n", " \n", " return base_data\n", "\n", " def _calculate_risk_scores(self, risk_level):\n", " \"\"\"Calculate detailed risk scores\"\"\"\n", " risk_base_scores = {\n", " 'low_risk': {\n", " 'liability_risk': random.uniform(0.1, 0.3),\n", " 'termination_risk': random.uniform(0.2, 0.4),\n", " 'indemnification_risk': random.uniform(0.1, 0.3),\n", " 'warranty_risk': random.uniform(0.3, 0.5),\n", " 'confidentiality_risk': random.uniform(0.2, 0.4)\n", " },\n", " 'medium_risk': {\n", " 'liability_risk': random.uniform(0.4, 0.6),\n", " 'termination_risk': random.uniform(0.5, 0.7),\n", " 'indemnification_risk': random.uniform(0.6, 0.8),\n", " 'warranty_risk': random.uniform(0.4, 0.6),\n", " 'confidentiality_risk': random.uniform(0.5, 0.7)\n", " },\n", " 'high_risk': {\n", " 'liability_risk': random.uniform(0.7, 0.9),\n", " 'termination_risk': random.uniform(0.6, 0.8),\n", " 'indemnification_risk': random.uniform(0.7, 0.95),\n", " 'warranty_risk': random.uniform(0.6, 0.8),\n", " 'confidentiality_risk': random.uniform(0.7, 0.9)\n", " }\n", " }\n", " \n", " scores = risk_base_scores[risk_level].copy()\n", " scores['overall_risk'] = sum(scores.values()) / len(scores)\n", " \n", " return scores\n", "\n", " def _generate_contract_specific_data(self, contract_type):\n", " \"\"\"Generate data specific to contract type\"\"\"\n", " if contract_type == 'employment_agreement':\n", " return self._generate_employment_data()\n", " elif 'lease' in contract_type:\n", " return self._generate_lease_data(contract_type)\n", " elif 'loan' in contract_type:\n", " return self._generate_loan_data()\n", " else:\n", " return self._generate_general_business_data(contract_type)\n", "\n", " def _generate_employment_data(self):\n", " \"\"\"Generate employment agreement specific data\"\"\"\n", " company = f\"{self.fake.company()} {random.choice(['Inc.', 'LLC', 'Corp.', 'Ltd.'])}\"\n", " employee = self.fake.name()\n", " \n", " return {\n", " 'company': company,\n", " 'employee': employee,\n", " 'position': random.choice([\n", " \"Senior Software Engineer\", \"Marketing Director\", \"Financial Analyst\",\n", " \"Operations Manager\", \"Product Manager\", \"Sales Executive\"\n", " ]),\n", " 'department': random.choice([\"Technology\", \"Marketing\", \"Finance\", \"Operations\", \"Sales\"]),\n", " 'salary': f\"${random.randint(60000, 180000):,} per annum\",\n", " 'start_date': self.fake.date_between(start_date='+5d', end_date='+30d').strftime('%B %d, %Y'),\n", " 'duration': random.choice([\"One year\", \"Two years\", \"Three years\", \"At-will\"]),\n", " 'benefits': random.choice([\n", " \"Standard health insurance and 401(k) matching\",\n", " \"Comprehensive benefits package including stock options\",\n", " \"Full medical, dental, vision, and retirement benefits\"\n", " ]),\n", " 'parties': [\n", " {'name': company, 'role': 'Employer', 'signatory': f\"{self.fake.name()}, CEO\"},\n", " {'name': employee, 'role': 'Employee', 'signatory': employee}\n", " ]\n", " }\n", "\n", " def _generate_lease_data(self, lease_type):\n", " \"\"\"Generate lease agreement specific data\"\"\"\n", " landlord = f\"{self.fake.company()} Properties\"\n", " tenant = f\"{self.fake.company()} {random.choice(['Retail', 'Office', 'Industrial', 'Commercial'])}\"\n", " is_commercial = 'commercial' in lease_type\n", " \n", " return {\n", " 'property_address': f\"{random.randint(100, 999)} {random.choice(['Main', 'Broadway', 'Market', 'Commerce'])} Street, {self.fake.city()}, {self.fake.state_abbr()} {self.fake.zipcode()}\",\n", " 'property_type': \"Commercial Space\" if is_commercial else \"Residential Property\",\n", " 'square_footage': f\"{random.randint(800, 5000)} square feet\",\n", " 'landlord': landlord,\n", " 'tenant': tenant,\n", " 'monthly_rent': f\"${random.randint(1500, 15000) if is_commercial else random.randint(1000, 5000):,}\",\n", " 'lease_term': f\"{random.randint(12, 60)} months\",\n", " 'parties': [\n", " {'name': landlord, 'role': 'Landlord', 'signatory': f\"{self.fake.name()}, Property Manager\"},\n", " {'name': tenant, 'role': 'Tenant', 'signatory': f\"{self.fake.name()}, President\"}\n", " ]\n", " }\n", "\n", " def _generate_loan_data(self):\n", " \"\"\"Generate loan agreement specific data\"\"\"\n", " lender = f\"{self.fake.company()} {random.choice(['Bank', 'Credit', 'Financial', 'Capital'])}\"\n", " borrower = self.fake.name()\n", " \n", " return {\n", " 'lender': lender,\n", " 'borrower': borrower,\n", " 'loan_amount': f\"${random.randint(25000, 500000):,}\",\n", " 'interest_rate': f\"{random.uniform(3.5, 12.5):.2f}%\",\n", " 'term_months': random.randint(12, 84),\n", " 'purpose': random.choice([\n", " \"Business expansion and working capital\",\n", " \"Equipment purchase and facility upgrade\", \n", " \"Debt consolidation and operational funding\",\n", " \"Real estate investment and development\"\n", " ]),\n", " 'parties': [\n", " {'name': lender, 'role': 'Lender', 'signatory': f\"{self.fake.name()}, Vice President\"},\n", " {'name': borrower, 'role': 'Borrower', 'signatory': borrower}\n", " ]\n", " }\n", "\n", " def _generate_general_business_data(self, contract_type):\n", " \"\"\"Generate data for general business contracts\"\"\"\n", " party1 = f\"{self.fake.company()} {random.choice(['Inc.', 'LLC', 'Corp.'])}\"\n", " party2 = f\"{self.fake.company()} {random.choice(['Solutions', 'Services', 'Group', 'Partners'])}\"\n", " \n", " return {\n", " 'party1': party1,\n", " 'party2': party2,\n", " 'agreement_purpose': f\"Business collaboration for {contract_type.replace('_', ' ')}\",\n", " 'term': f\"{random.randint(6, 36)} months\",\n", " 'parties': [\n", " {'name': party1, 'role': 'First Party', 'signatory': f\"{self.fake.name()}, Authorized Signatory\"},\n", " {'name': party2, 'role': 'Second Party', 'signatory': f\"{self.fake.name()}, Authorized Signatory\"}\n", " ]\n", " }\n", "\n", " def generate_docx_contract(self, contract_type, data):\n", " \"\"\"Generate actual DOCX contract with content\"\"\"\n", " try:\n", " doc = Document()\n", " \n", " # Title\n", " title = doc.add_heading(f\"{contract_type.replace('_', ' ').title()}\", 0)\n", " doc.add_paragraph(f\"Contract ID: {data['contract_id']}\")\n", " doc.add_paragraph(f\"Effective Date: {data['effective_date']}\")\n", " doc.add_paragraph(f\"Execution Date: {data['execution_date']}\")\n", " doc.add_paragraph(f\"Risk Level: {data['risk_level'].replace('_', ' ').title()}\")\n", " \n", " doc.add_paragraph() # Empty line\n", " \n", " # Parties Section\n", " doc.add_heading(\"PARTIES\", level=1)\n", " for party in data['parties']:\n", " doc.add_paragraph(f\"{party['role']}: {party['name']}\")\n", " \n", " doc.add_paragraph() # Empty line\n", " \n", " # Recitals\n", " doc.add_heading(\"RECITALS\", level=1)\n", " doc.add_paragraph(\"WHEREAS, the Parties desire to enter into this Agreement to set forth the terms and conditions of their relationship;\")\n", " doc.add_paragraph(\"WHEREAS, each Party has the requisite power and authority to enter into this Agreement;\")\n", " doc.add_paragraph(\"WHEREAS, the Parties intend to be legally bound by the terms herein;\")\n", " doc.add_paragraph(\"NOW, THEREFORE, in consideration of the mutual covenants contained herein, the Parties agree as follows:\")\n", " \n", " doc.add_paragraph() # Empty line\n", " \n", " # Agreement Terms\n", " doc.add_heading(\"AGREEMENT\", level=1)\n", " \n", " # Contract-specific content\n", " if contract_type == 'employment_agreement':\n", " self._add_employment_content_docx(doc, data)\n", " elif 'lease' in contract_type:\n", " self._add_lease_content_docx(doc, data)\n", " elif contract_type == 'loan_agreement':\n", " self._add_loan_content_docx(doc, data)\n", " else:\n", " self._add_general_content_docx(doc, data)\n", " \n", " # Risk clauses\n", " self._add_risk_clauses_docx(doc, data)\n", " \n", " # Standard provisions\n", " self._add_standard_provisions_docx(doc, data)\n", " \n", " # Signature section\n", " self._add_signature_section_docx(doc, data)\n", " \n", " # Save document\n", " filename = self.base_dir / f\"contracts/docx/{contract_type}_{data['risk_level']}_{data['contract_id']}.docx\"\n", " doc.save(filename)\n", " \n", " return filename\n", " \n", " except Exception as e:\n", " print(f\"āŒ Error generating DOCX: {e}\")\n", " return None\n", "\n", " def generate_pdf_contract(self, contract_type, data):\n", " \"\"\"Generate actual PDF contract with content\"\"\"\n", " try:\n", " filename = self.base_dir / f\"contracts/pdf/{contract_type}_{data['risk_level']}_{data['contract_id']}.pdf\"\n", " doc = SimpleDocTemplate(str(filename), pagesize=LETTER, \n", " topMargin=1*inch, bottomMargin=1*inch,\n", " leftMargin=1*inch, rightMargin=1*inch)\n", " styles = getSampleStyleSheet()\n", " story = []\n", " \n", " # Title\n", " title_style = ParagraphStyle(\n", " 'CustomTitle',\n", " parent=styles['Heading1'],\n", " fontSize=16,\n", " spaceAfter=30,\n", " alignment=1,\n", " textColor=colors.darkblue\n", " )\n", " \n", " story.append(Paragraph(f\"{contract_type.replace('_', ' ').title()}\", title_style))\n", " story.append(Paragraph(f\"Contract ID: {data['contract_id']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Effective Date: {data['effective_date']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Risk Level: {data['risk_level'].replace('_', ' ').title()}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 0.25*inch))\n", " \n", " # Parties\n", " story.append(Paragraph(\"PARTIES\", styles[\"Heading2\"]))\n", " for party in data['parties']:\n", " story.append(Paragraph(f\"{party['role']}: {party['name']}\", styles[\"Normal\"]))\n", " \n", " story.append(Spacer(1, 0.2*inch))\n", " \n", " # Recitals\n", " story.append(Paragraph(\"RECITALS\", styles[\"Heading2\"]))\n", " recitals = [\n", " \"WHEREAS, the Parties desire to enter into this Agreement to set forth the terms and conditions of their relationship;\",\n", " \"WHEREAS, each Party has the requisite power and authority to enter into this Agreement;\",\n", " \"WHEREAS, the Parties intend to be legally bound by the terms herein;\",\n", " \"NOW, THEREFORE, in consideration of the mutual covenants contained herein, the Parties agree as follows:\"\n", " ]\n", " for recital in recitals:\n", " story.append(Paragraph(recital, styles[\"Normal\"]))\n", " \n", " story.append(Spacer(1, 0.2*inch))\n", " \n", " # Agreement\n", " story.append(Paragraph(\"AGREEMENT\", styles[\"Heading2\"]))\n", " \n", " if contract_type == 'employment_agreement':\n", " self._add_employment_content_pdf(story, data, styles)\n", " elif 'lease' in contract_type:\n", " self._add_lease_content_pdf(story, data, styles)\n", " elif contract_type == 'loan_agreement':\n", " self._add_loan_content_pdf(story, data, styles)\n", " else:\n", " self._add_general_content_pdf(story, data, styles)\n", " \n", " # Risk clauses\n", " self._add_risk_clauses_pdf(story, data, styles)\n", " \n", " # Standard provisions\n", " self._add_standard_provisions_pdf(story, data, styles)\n", " \n", " # Signature section\n", " self._add_signature_section_pdf(story, data, styles)\n", " \n", " doc.build(story)\n", " return filename\n", " \n", " except Exception as e:\n", " print(f\"āŒ Error generating PDF: {e}\")\n", " return None\n", "\n", " # DOCX Content Methods\n", " def _add_employment_content_docx(self, doc, data):\n", " doc.add_heading(\"EMPLOYMENT TERMS\", level=2)\n", " doc.add_paragraph(f\"Position: {data['position']}\")\n", " doc.add_paragraph(f\"Department: {data['department']}\")\n", " doc.add_paragraph(f\"Salary: {data['salary']}\")\n", " doc.add_paragraph(f\"Start Date: {data['start_date']}\")\n", " doc.add_paragraph(f\"Duration: {data['duration']}\")\n", " doc.add_paragraph(f\"Benefits: {data['benefits']}\")\n", "\n", " def _add_lease_content_docx(self, doc, data):\n", " doc.add_heading(\"LEASE TERMS\", level=2)\n", " doc.add_paragraph(f\"Property Address: {data['property_address']}\")\n", " doc.add_paragraph(f\"Property Type: {data['property_type']}\")\n", " doc.add_paragraph(f\"Square Footage: {data['square_footage']}\")\n", " doc.add_paragraph(f\"Monthly Rent: {data['monthly_rent']}\")\n", " doc.add_paragraph(f\"Lease Term: {data['lease_term']}\")\n", "\n", " def _add_loan_content_docx(self, doc, data):\n", " doc.add_heading(\"LOAN TERMS\", level=2)\n", " doc.add_paragraph(f\"Loan Amount: {data['loan_amount']}\")\n", " doc.add_paragraph(f\"Interest Rate: {data['interest_rate']}\")\n", " doc.add_paragraph(f\"Term: {data['term_months']} months\")\n", " doc.add_paragraph(f\"Purpose: {data['purpose']}\")\n", "\n", " def _add_general_content_docx(self, doc, data):\n", " doc.add_heading(\"TERMS AND CONDITIONS\", level=2)\n", " doc.add_paragraph(f\"Agreement Purpose: {data['agreement_purpose']}\")\n", " doc.add_paragraph(f\"Term: {data['term']}\")\n", "\n", " def _add_risk_clauses_docx(self, doc, data):\n", " doc.add_heading(\"STANDARD PROVISIONS\", level=2)\n", " for clause_type, clause_text in data['clauses_used'].items():\n", " p = doc.add_paragraph()\n", " p.add_run(f\"{clause_type.replace('_', ' ').title()}: \").bold = True\n", " p.add_run(clause_text)\n", "\n", " def _add_standard_provisions_docx(self, doc, data):\n", " doc.add_heading(\"ADDITIONAL PROVISIONS\", level=2)\n", " provisions = [\n", " \"This Agreement constitutes the entire understanding between the Parties.\",\n", " \"No modification shall be effective unless in writing signed by both Parties.\",\n", " \"The failure to enforce any provision shall not constitute a waiver.\",\n", " \"If any provision is invalid, the remaining provisions shall continue in effect.\",\n", " \"This Agreement may be executed in counterparts.\"\n", " ]\n", " for provision in provisions:\n", " doc.add_paragraph(f\"• {provision}\", style='List Bullet')\n", "\n", " def _add_signature_section_docx(self, doc, data):\n", " doc.add_heading(\"IN WITNESS WHEREOF\", level=2)\n", " doc.add_paragraph(\"The Parties have executed this Agreement as of the date first written above.\")\n", " doc.add_paragraph()\n", " \n", " for party in data['parties']:\n", " doc.add_paragraph(\"_________________________\")\n", " doc.add_paragraph(party['signatory'])\n", " doc.add_paragraph(party['role'])\n", " doc.add_paragraph(f\"Date: _________________________\")\n", " doc.add_paragraph()\n", "\n", " # PDF Content Methods\n", " def _add_employment_content_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"EMPLOYMENT TERMS\", styles[\"Heading2\"]))\n", " story.append(Paragraph(f\"Position: {data['position']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Department: {data['department']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Salary: {data['salary']}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 12))\n", "\n", " def _add_lease_content_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"LEASE TERMS\", styles[\"Heading2\"]))\n", " story.append(Paragraph(f\"Property Address: {data['property_address']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Monthly Rent: {data['monthly_rent']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Lease Term: {data['lease_term']}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 12))\n", "\n", " def _add_loan_content_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"LOAN TERMS\", styles[\"Heading2\"]))\n", " story.append(Paragraph(f\"Loan Amount: {data['loan_amount']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Interest Rate: {data['interest_rate']}\", styles[\"Normal\"]))\n", " story.append(Paragraph(f\"Term: {data['term_months']} months\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 12))\n", "\n", " def _add_general_content_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"TERMS AND CONDITIONS\", styles[\"Heading2\"]))\n", " story.append(Paragraph(f\"Agreement Purpose: {data['agreement_purpose']}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 12))\n", "\n", " def _add_risk_clauses_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"STANDARD PROVISIONS\", styles[\"Heading2\"]))\n", " for clause_type, clause_text in data['clauses_used'].items():\n", " story.append(Paragraph(f\"{clause_type.replace('_', ' ').title()}: {clause_text}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 6))\n", "\n", " def _add_standard_provisions_pdf(self, story, data, styles):\n", " story.append(Paragraph(\"ADDITIONAL PROVISIONS\", styles[\"Heading2\"]))\n", " provisions = [\n", " \"This Agreement constitutes the entire understanding between the Parties.\",\n", " \"No modification shall be effective unless in writing signed by both Parties.\",\n", " \"The failure to enforce any provision shall not constitute a waiver.\",\n", " \"If any provision is invalid, the remaining provisions shall continue in effect.\"\n", " ]\n", " for provision in provisions:\n", " story.append(Paragraph(f\"• {provision}\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 3))\n", "\n", " def _add_signature_section_pdf(self, story, data, styles):\n", " story.append(Spacer(1, 24))\n", " story.append(Paragraph(\"IN WITNESS WHEREOF\", styles[\"Heading2\"]))\n", " story.append(Paragraph(\"The Parties have executed this Agreement as of the date first written above.\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 24))\n", " \n", " for party in data['parties']:\n", " story.append(Paragraph(\"_________________________\", styles[\"Normal\"]))\n", " story.append(Paragraph(party['signatory'], styles[\"Normal\"]))\n", " story.append(Paragraph(party['role'], styles[\"Normal\"]))\n", " story.append(Paragraph(\"Date: _________________________\", styles[\"Normal\"]))\n", " story.append(Spacer(1, 24))\n", "\n", " def _save_annotation(self, contract_data, file_path):\n", " \"\"\"Save annotation JSON file\"\"\"\n", " annotation_data = {\n", " 'contract_id': contract_data['contract_id'],\n", " 'filename': file_path.name,\n", " 'file_path': str(file_path),\n", " 'contract_type': contract_data['contract_type'],\n", " 'risk_level': contract_data['risk_level'],\n", " 'risk_scores': contract_data['risk_annotations'],\n", " 'clauses_used': contract_data['clauses_used'],\n", " 'parties': contract_data['parties'],\n", " 'generation_date': contract_data['generation_date'],\n", " 'metadata': contract_data.get('metadata', {})\n", " }\n", " \n", " annotation_file = self.base_dir / f\"annotations/{contract_data['contract_id']}.json\"\n", " with open(annotation_file, 'w') as f:\n", " json.dump(annotation_data, f, indent=2)\n", "\n", " def _create_dataset_splits(self):\n", " \"\"\"Create train/test/validation splits\"\"\"\n", " print(\"\\nšŸ“Š Creating dataset splits...\")\n", " \n", " # Get all annotation files\n", " annotation_files = list((self.base_dir / 'annotations').glob('*.json'))\n", " random.shuffle(annotation_files)\n", " \n", " # Split ratios\n", " train_ratio, test_ratio, val_ratio = 0.7, 0.2, 0.1\n", " n_total = len(annotation_files)\n", " \n", " n_train = int(n_total * train_ratio)\n", " n_test = int(n_total * test_ratio)\n", " n_val = n_total - n_train - n_test\n", " \n", " splits = {\n", " 'train': annotation_files[:n_train],\n", " 'test': annotation_files[n_train:n_train + n_test],\n", " 'validation': annotation_files[n_train + n_test:]\n", " }\n", " \n", " # Create split files\n", " for split_name, files in splits.items():\n", " split_file = self.base_dir / f\"{split_name}_split/split.json\"\n", " file_list = [f.name for f in files]\n", " with open(split_file, 'w') as f:\n", " json.dump(file_list, f, indent=2)\n", " \n", " print(f\" āœ… {split_name}: {len(files)} contracts\")\n", "\n", " def _generate_dataset_report(self, stats):\n", " \"\"\"Generate comprehensive dataset report\"\"\"\n", " report = {\n", " 'generation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),\n", " 'total_contracts': stats['total_contracts'],\n", " 'risk_distribution': stats['by_risk_level'],\n", " 'format_distribution': stats['by_format'],\n", " 'contract_type_distribution': stats['by_type'],\n", " 'folder_structure': {\n", " 'contracts_pdf': len(list((self.base_dir / 'contracts/pdf').glob('*.pdf'))),\n", " 'contracts_docx': len(list((self.base_dir / 'contracts/docx').glob('*.docx'))),\n", " 'annotations': len(list((self.base_dir / 'annotations').glob('*.json'))),\n", " }\n", " }\n", " \n", " report_file = self.base_dir / 'reports/dataset_report.json'\n", " with open(report_file, 'w') as f:\n", " json.dump(report, f, indent=2)\n", " \n", " # Print summary\n", " print(\"\\n\" + \"=\" * 60)\n", " print(\"šŸ“Š DATASET GENERATION COMPLETE!\")\n", " print(\"=\" * 60)\n", " print(f\"šŸ“ Dataset Location: {self.base_dir}\")\n", " print(f\"šŸ“„ Total Contracts: {stats['total_contracts']}\")\n", " print(f\"šŸŽÆ Risk Distribution:\")\n", " for risk_level, count in stats['by_risk_level'].items():\n", " print(f\" • {risk_level}: {count} contracts\")\n", " print(f\"šŸ“ Format Distribution:\")\n", " for format_type, count in stats['by_format'].items():\n", " print(f\" • {format_type.upper()}: {count} contracts\")\n", " print(f\"šŸ“‘ Contract Types: {len(stats['by_type'])} unique types\")\n", " print(f\"šŸ”€ Dataset Splits:\")\n", " print(f\" • Training: 70%\")\n", " print(f\" • Testing: 20%\") \n", " print(f\" • Validation: 10%\")\n", " print(f\"šŸ“ˆ Perfect for supervised AI training! šŸš€\")\n", "\n", "def main():\n", " \"\"\"Main function to generate the dataset\"\"\"\n", " print(\"šŸŽÆ Scalable Contract Dataset Generator for AI Testing\")\n", " print(\"=\" * 60)\n", " \n", " # Generate 200 contracts for proper supervised learning\n", " generator = ScalableContractGenerator(total_contracts=200)\n", " \n", " # Generate the complete dataset\n", " dataset_stats = generator.generate_contract_dataset()\n", " \n", " print(f\"\\nāœ… Dataset ready for AI training!\")\n", " print(f\"šŸ“ Location: {generator.base_dir}\")\n", "\n", "if __name__ == \"__main__\":\n", " main()\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "24be2b8d-d765-4f29-b2f3-f015d8145d5a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }