File size: 10,251 Bytes
73e99c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python3
"""
Extract and Organize Audrey Berger Welz's Original Material
Prioritizes her original writing and identifies what's truly hers
"""

import json
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set, Tuple

class AudreyMaterialExtractor:
    """Extract and organize Audrey's original material.

    Loads every available draft of the manuscript, ranks each version by how
    likely it is to be Audrey's own writing (earlier drafts and the copy sent
    to her editor rank highest), deduplicates paragraphs across versions, and
    compiles a clean manuscript containing only her material.
    """

    def __init__(self):
        # Base directory holding all manuscript versions.
        self.manuscripts_dir = Path("manuscripts")
        # Output directory for the compiled original material.
        self.audrey_material_dir = Path("manuscripts/Shadow_of_Lillya/audrey_original")
        self.audrey_material_dir.mkdir(parents=True, exist_ok=True)

    def load_all_versions(self) -> Dict[str, Dict]:
        """Load all versions of the manuscript from disk.

        Returns:
            Mapping of version id -> metadata dict with keys ``content``,
            ``source``, ``priority`` (lower = more likely Audrey's),
            ``date``, ``description`` and (for unedited files) ``is_audrey``.
        """
        versions = {}

        # Edited version (sent to Tyson - most likely to be Audrey's).
        # NOTE: if this directory ever holds more than one .md file, only the
        # last one survives under the single 'edited_tyson' key.
        edited_dir = self.manuscripts_dir / "Shadow_of_Lillya" / "edited_version"
        for md_file in edited_dir.glob("*.md"):
            versions['edited_tyson'] = {
                'content': md_file.read_text(encoding='utf-8'),
                'source': str(md_file),
                'priority': 1,  # Highest priority - sent to editor
                'date': '2020-03-12',  # From filename
                'description': 'Rough Draft sent to Tyson Cornell at Rare Bird Books'
            }

        # Unedited material (earlier versions - likely more of Audrey's work).
        unedited_dir = self.manuscripts_dir / "Shadow_of_Lillya" / "unedited_material"
        version_num = 2

        for md_file in sorted(unedited_dir.glob("*.md")):
            content = md_file.read_text(encoding='utf-8')
            filename = md_file.stem

            # Extract date from filename if possible (MM_DD_YY[YY] or ISO).
            date_match = re.search(r'(\d{2}_\d{2}_\d{2,4})|(\d{4}-\d{2}-\d{2})', filename)
            date = date_match.group() if date_match else None

            # Determine if this is likely Audrey's work from filename markers.
            is_audrey = True
            if 'v69' in filename or 'v73' in filename:
                # Later versions might have ghost writer material
                is_audrey = False
                description = "Later version - may contain ghost writer material"
            elif 'GW' in filename or 'gw' in filename:
                # Explicitly marked as ghost writer
                is_audrey = False
                description = "Ghost writer version"
            else:
                description = "Earlier version - likely Audrey's original work"

            versions[f'version_{version_num}'] = {
                'content': content,
                'source': str(md_file),
                'priority': version_num if is_audrey else 100,  # Lower priority for non-Audrey
                'date': date,
                'is_audrey': is_audrey,
                'description': description
            }
            version_num += 1

        return versions

    def extract_unique_paragraphs(
        self, versions: Dict[str, Dict]
    ) -> Tuple[Dict[str, Set[str]], Dict[str, List[str]]]:
        """Extract unique normalized paragraphs from each version.

        Fix: the original annotation claimed ``Dict[str, List]`` but the
        method has always returned a 2-tuple.

        Args:
            versions: mapping of version id -> version metadata (see
                :meth:`load_all_versions`).

        Returns:
            A pair ``(paragraph_sets, paragraph_to_version)`` where
            ``paragraph_sets`` maps version id -> set of normalized
            paragraphs, and ``paragraph_to_version`` maps each normalized
            paragraph -> list of version ids it appears in.
        """
        paragraph_sets: Dict[str, Set[str]] = {}
        paragraph_to_version: Dict[str, List[str]] = defaultdict(list)

        # Process versions from highest priority (lowest number) down so
        # Audrey's versions register their paragraphs first.
        for version_id, version_data in sorted(versions.items(), key=lambda x: x[1]['priority']):
            content = version_data['content']
            # Paragraphs are blank-line separated; drop short fragments.
            paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 50]

            paragraph_sets[version_id] = set()
            for para in paragraphs:
                # Collapse internal whitespace so near-identical paragraphs
                # compare equal across versions.
                normalized = re.sub(r'\s+', ' ', para).strip()
                if len(normalized) > 50:  # Only meaningful paragraphs
                    paragraph_sets[version_id].add(normalized)
                    paragraph_to_version[normalized].append(version_id)

        return paragraph_sets, paragraph_to_version

    def identify_audrey_core_material(self, versions: Dict[str, Dict]) -> Dict:
        """Identify core material that's most likely Audrey's original work.

        Returns:
            Dict with keys ``core_paragraphs``, ``organized_by_source``,
            ``total_paragraphs``, ``sources`` and
            ``paragraph_count_by_source``.
        """
        paragraph_sets, paragraph_to_version = self.extract_unique_paragraphs(versions)

        # Versions considered Audrey's: flagged (or unflagged, defaulting to
        # True) AND high priority.
        audrey_versions = [v for v, d in versions.items() if d.get('is_audrey', True) and d['priority'] < 10]

        audrey_paragraphs = set()
        for version_id in audrey_versions:
            audrey_paragraphs.update(paragraph_sets.get(version_id, set()))

        # Core material = everything appearing in Audrey's versions.
        # Paragraphs exclusive to ghost-writer versions are absent from this
        # set by construction; the previous expression
        # ``audrey - (ghost - audrey)`` was a set-algebra no-op that always
        # equalled ``audrey``, so the ghost-set computation has been dropped.
        core_material = audrey_paragraphs

        # Attribute each paragraph to its highest-priority source version.
        organized_material = defaultdict(list)
        for para in core_material:
            sources = paragraph_to_version.get(para, [])
            best_source = min(sources, key=lambda s: versions[s]['priority']) if sources else None
            if best_source:
                organized_material[best_source].append(para)

        return {
            'core_paragraphs': list(core_material),
            'organized_by_source': dict(organized_material),
            'total_paragraphs': len(core_material),
            'sources': {k: versions[k]['description'] for k in organized_material},
            # Fix: main() reports this key but it was never populated,
            # causing a KeyError at the end of every run.
            'paragraph_count_by_source': {k: len(v) for k, v in organized_material.items()},
        }

    def create_audrey_original_manuscript(self, core_material: Dict, versions: Dict = None) -> str:
        """Create a clean manuscript from Audrey's original material.

        Args:
            core_material: result of :meth:`identify_audrey_core_material`.
            versions: optional version metadata; when given, sources are
                ordered by priority so the highest-confidence material leads.

        Returns:
            The compiled manuscript as a markdown string.
        """
        if versions:
            # Order sources by priority (lowest number first).
            source_order = sorted(core_material['organized_by_source'].items(),
                                  key=lambda x: versions[x[0]]['priority'])
        else:
            source_order = list(core_material['organized_by_source'].items())

        # Fix: source_order was previously computed but then ignored (the
        # loop re-iterated the raw dict), so the priority ordering never
        # took effect.
        all_paragraphs = []
        for _source_id, paragraphs in source_order:
            all_paragraphs.extend(paragraphs)

        # Fix: paragraphs were split on blank lines but rejoined with a
        # single newline, which merges them into one markdown paragraph.
        # Rejoin with a blank line to preserve paragraph boundaries.
        body = '\n\n'.join(all_paragraphs)

        manuscript = f"""# The Shadow of Lillya
## Original Material by Audrey Berger Welz

**Compiled from:** {', '.join(core_material['sources'].values())}
**Total Paragraphs:** {core_material['total_paragraphs']}
**Compilation Date:** {datetime.now().strftime('%Y-%m-%d')}

---

## Note on Material

This manuscript contains only material identified as Audrey Berger Welz's original work, extracted from her draft versions. Material from later ghost writer versions or other sources has been excluded to preserve the authenticity of her voice and vision.

---

{body}
"""
        return manuscript

    def save_audrey_material(self, core_material: Dict, manuscript: str):
        """Save Audrey's original material to the output directory.

        Writes the compiled manuscript, a JSON metadata file, and one
        markdown file per source version.

        Returns:
            Tuple of (manuscript path, metadata path).
        """
        # Save the compiled manuscript.
        manuscript_file = self.audrey_material_dir / "audrey_original_compiled.md"
        manuscript_file.write_text(manuscript, encoding='utf-8')

        # Save metadata describing the compilation.
        metadata = {
            'compilation_date': datetime.now().isoformat(),
            'total_paragraphs': core_material['total_paragraphs'],
            'sources': core_material['sources'],
            'paragraph_count_by_source': {k: len(v) for k, v in core_material['organized_by_source'].items()}
        }

        metadata_file = self.audrey_material_dir / "compilation_metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps any non-ASCII text readable in the file.
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        # Save individual source files (blank line between paragraphs).
        for source_id, paragraphs in core_material['organized_by_source'].items():
            source_file = self.audrey_material_dir / f"source_{source_id}.md"
            with open(source_file, 'w', encoding='utf-8') as f:
                f.write(f"# Material from {source_id}\n\n")
                f.write('\n\n'.join(paragraphs) + "\n")

        return manuscript_file, metadata_file

def main():
    """Run the full extraction pipeline: load versions, identify Audrey's
    core material, compile a manuscript, and save everything to disk."""
    print("📚 Extracting Audrey Berger Welz's Original Material...\n")

    extractor = AudreyMaterialExtractor()

    # Load all versions
    print("📖 Loading all manuscript versions...")
    versions = extractor.load_all_versions()
    print(f"  ✓ Loaded {len(versions)} versions\n")

    # Identify core material
    print("🔍 Identifying Audrey's original material...")
    core_material = extractor.identify_audrey_core_material(versions)
    print(f"  ✓ Identified {core_material['total_paragraphs']} paragraphs of original material")
    print(f"  ✓ From {len(core_material['sources'])} source versions\n")

    # Create compiled manuscript
    print("📝 Compiling original manuscript...")
    manuscript = extractor.create_audrey_original_manuscript(core_material, versions)

    # Save files
    print("💾 Saving files...")
    manuscript_file, metadata_file = extractor.save_audrey_material(core_material, manuscript)

    print(f"\n✅ Complete!")
    print(f"  📄 Compiled manuscript: {manuscript_file}")
    print(f"  📊 Metadata: {metadata_file}")
    print(f"\n📋 Material organized by source:")
    # Fix: identify_audrey_core_material does not return a
    # 'paragraph_count_by_source' key, so reading it here raised KeyError
    # on every run. Derive the counts from 'organized_by_source', which the
    # returned dict does contain.
    for source, paragraphs in core_material['organized_by_source'].items():
        print(f"  - {source}: {len(paragraphs)} paragraphs")


if __name__ == '__main__':
    main()