File size: 9,965 Bytes
b773b72
 
 
 
 
 
 
 
 
 
 
a356e85
 
 
 
 
 
 
 
 
b773b72
 
 
 
 
 
 
a356e85
 
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a356e85
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a356e85
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
a9936e3
b773b72
 
a9936e3
b773b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
"""
Data preparation script for Church Fathers Commentary API
Copies commentary embedding JSON files from church-fathers repo or generates new ones
"""

import json
import shutil
from pathlib import Path
import argparse


# Canonical 39 Old Testament books, lowercased with spaces removed, matching the
# directory names used by the church-fathers commentary_embeddings layout.
OLD_TESTAMENT_BOOKS = [
    "genesis", "exodus", "leviticus", "numbers", "deuteronomy", "joshua", "judges", "ruth",
    "1samuel", "2samuel", "1kings", "2kings", "1chronicles", "2chronicles", "ezra",
    "nehemiah", "esther", "job", "psalms", "proverbs", "ecclesiastes", "songofsolomon",
    "isaiah", "jeremiah", "lamentations", "ezekiel", "daniel", "hosea", "joel", "amos",
    "obadiah", "jonah", "micah", "nahum", "habakkuk", "zephaniah", "haggai", "zechariah",
    "malachi"
]

# Canonical 27 New Testament books, same naming convention. Also used directly by
# generate_embeddings_from_db(), which processes NT commentary only.
NEW_TESTAMENT_BOOKS = [
    "matthew", "mark", "luke", "john", "acts", "romans", "1corinthians", "2corinthians",
    "galatians", "ephesians", "philippians", "colossians", "1thessalonians", "2thessalonians",
    "1timothy", "2timothy", "titus", "philemon", "hebrews", "james", "1peter",
    "2peter", "1john", "2john", "3john", "jude", "revelation"
]

# Full 66-book canon; the default set processed when copying embeddings.
ALL_BOOKS = OLD_TESTAMENT_BOOKS + NEW_TESTAMENT_BOOKS


def copy_embeddings_from_source(source_dir: Path, output_dir: Path, books=None):
    """
    Copy commentary embeddings from the church-fathers repository.

    Each JSON file is validated before copying: it must contain 'content',
    'metadata', and a non-empty list under 'embedding'. Invalid or unreadable
    files are reported and skipped.

    Args:
        source_dir: Path to church-fathers commentary_embeddings directory
        output_dir: Path to output directory (e.g., ./data)
        books: Optional iterable of book directory names to process.
            Defaults to ALL_BOOKS, preserving the original behavior.

    Returns:
        (total_entries, missing_books): count of valid files copied, and the
        list of books that had no directory, no JSON files, or no valid files.
    """
    if books is None:
        books = ALL_BOOKS

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    copied_count = 0
    total_entries = 0
    missing_books = []

    print(f"Copying embeddings from: {source_dir}")
    print(f"Output directory: {output_dir}")
    print("-" * 60)

    for book in books:
        book_dir = source_dir / book

        if not book_dir.exists():
            print(f"✗ {book} directory not found")
            missing_books.append(book)
            continue

        # Copy all JSON files for this book
        json_files = list(book_dir.glob("*.json"))

        if not json_files:
            print(f"✗ No JSON files found for {book}")
            missing_books.append(book)
            continue

        # Create book subdirectory in output
        book_output_dir = output_dir / book
        book_output_dir.mkdir(exist_ok=True)

        book_entries = 0
        for json_file in json_files:
            try:
                # Validate JSON structure before copying
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Verify required fields
                if 'content' not in data or 'metadata' not in data or 'embedding' not in data:
                    print(f"  ✗ Skipping {json_file.name}: missing required fields")
                    continue

                # Verify embedding is valid
                if not isinstance(data['embedding'], list) or len(data['embedding']) == 0:
                    print(f"  ✗ Skipping {json_file.name}: invalid embedding")
                    continue

                # Copy file (copy2 preserves metadata/timestamps)
                output_file = book_output_dir / json_file.name
                shutil.copy2(json_file, output_file)
                book_entries += 1

            except Exception as e:
                print(f"  ✗ Error processing {json_file.name}: {e}")
                continue

        if book_entries > 0:
            print(f"✓ {book}: copied {book_entries} files")
            # BUGFIX: previously added len(json_files), which counted files
            # that were skipped as invalid or failed to copy.
            copied_count += book_entries
            total_entries += book_entries
        else:
            print(f"✗ {book}: no valid files found")
            missing_books.append(book)

    print("-" * 60)
    print(f"\nCopy complete:")
    print(f"  Total files copied: {copied_count}")
    print(f"  Total entries: {total_entries}")
    print(f"  Books processed: {len(books) - len(missing_books)}/{len(books)}")

    if missing_books:
        print(f"  Missing books: {', '.join(missing_books)}")

    # Calculate total size
    if output_dir.exists():
        total_size = sum(f.stat().st_size for f in output_dir.rglob("*.json"))
        print(f"  Total size: {total_size / 1024 / 1024:.2f} MB")

    return total_entries, missing_books


def generate_embeddings_from_db(db_file: Path, output_dir: Path, model_name: str):
    """
    Generate embeddings from a SQLite database using the commentary.py approach.

    Selects substantial (>= 1000 chars) New Testament commentary entries by a
    fixed set of nine church fathers, embeds each text with a
    SentenceTransformer model, and writes one JSON file per entry under
    output_dir/<book>/.

    Args:
        db_file: Path to SQLite database (data.sqlite)
        output_dir: Path to output directory (e.g., ./data)
        model_name: HuggingFace model name (default: BAAI/bge-large-en-v1.5)
    """
    # Heavy/third-party dependencies are imported lazily so the copy-only
    # code path does not require them to be installed.
    import sqlite3
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer

    print("WARNING: This will generate embeddings from scratch. This may take a long time!")
    print(f"Using model: {model_name}")
    print(f"Database: {db_file}")
    print("-" * 60)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load model
    print("Loading embedding model...")
    model = SentenceTransformer(model_name)

    # Connect to database; ensure it is closed even if embedding fails midway
    connection = sqlite3.connect(db_file)
    try:
        cursor = connection.cursor()

        # Query church fathers (NT-only, 9 fathers)
        top_authors = [
            "Augustine of Hippo",
            "Athanasius of Alexandria",
            "Basil of Caesarea",
            "Gregory of Nazianzus",
            "Gregory of Nyssa",
            "Cyril of Alexandria",
            "Irenaeus",
            "Cyprian",
            "Origen of Alexandria"
        ]

        # Placeholders are generated to match the author and book lists;
        # Aquinas-quoted fragments and short/unsourced entries are excluded.
        query = """
            SELECT id, father_name, file_name, append_to_author_name, ts, book,
                   location_start, location_end, txt, source_url, source_title
            FROM commentary
            WHERE father_name IN ({})
            AND book IN ({})
            AND append_to_author_name NOT LIKE '%quoted by Aquinas%'
            AND LENGTH(txt) >= 1000
            AND source_title IS NOT NULL
            AND source_title != ''
        """.format(
            ','.join('?' * len(top_authors)),
            ','.join('?' * len(NEW_TESTAMENT_BOOKS))
        )

        cursor.execute(query, top_authors + NEW_TESTAMENT_BOOKS)
        rows = cursor.fetchall()

        print(f"Found {len(rows)} commentary entries to process")

        # Process each row
        for row in tqdm(rows, desc="Generating embeddings"):
            # 'row_id' avoids shadowing the builtin 'id'
            row_id, father_name, file_name, append_to_author_name, ts, book, \
            location_start, location_end, txt, source_url, source_title = row

            # Generate a normalized embedding (unit length) for cosine search
            embedding = model.encode(txt, normalize_embeddings=True).tolist()

            # Prepare data
            data = {
                "content": txt,
                "metadata": {
                    "id": row_id,
                    "father_name": father_name,
                    "book": book,
                    "location_start": location_start,
                    "location_end": location_end,
                    "source_url": source_url,
                    "source_title": source_title,
                    "append_to_author_name": append_to_author_name
                },
                "embedding": embedding
            }

            # Save to file
            book_dir = output_dir / book
            book_dir.mkdir(exist_ok=True)

            # Generate unique filename (row id guarantees uniqueness)
            safe_father = father_name.replace(' ', '_')
            filename = f"{book}_{safe_father}_{row_id}.json"

            with open(book_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(data, f)

        cursor.close()
    finally:
        connection.close()

    print("✓ Embedding generation complete!")


def main():
    """CLI entry point: copy existing embeddings or generate them from a DB.

    Returns 0 on success, 1 on a usage or input error (suitable for exit()).
    """
    arg_parser = argparse.ArgumentParser(
        description="Prepare Church Fathers commentary embeddings for HF Spaces")
    arg_parser.add_argument(
        "--source", type=str,
        help="Source directory containing church-fathers commentary_embeddings (e.g., ../church-fathers/commentary_embeddings)")
    arg_parser.add_argument(
        "--output", type=str, default="./data",
        help="Output directory for JSON files (default: ./data)")
    arg_parser.add_argument(
        "--generate", action="store_true",
        help="Generate embeddings from SQLite database instead of copying")
    arg_parser.add_argument(
        "--db", type=str,
        help="Path to SQLite database file (required if --generate is used)")
    arg_parser.add_argument(
        "--model", type=str, default="BAAI/bge-large-en-v1.5",
        help="Model name for embedding generation (default: BAAI/bge-large-en-v1.5)")

    opts = arg_parser.parse_args()
    out_path = Path(opts.output)

    if opts.generate:
        # Generation mode: --db must point at an existing SQLite file.
        if not opts.db:
            print("Error: --db is required when using --generate")
            return 1

        db_path = Path(opts.db)
        if not db_path.exists():
            print(f"Error: Database file not found: {db_path}")
            return 1

        generate_embeddings_from_db(db_path, out_path, opts.model)
    else:
        # Copy mode: --source must point at an existing directory.
        if not opts.source:
            print("Error: --source is required unless using --generate")
            print("\nUsage:")
            print("  Copy embeddings:    python prepare_data.py --source ../church-fathers/commentary_embeddings")
            print("  Generate new:       python prepare_data.py --generate --db path/to/data.sqlite")
            return 1

        src_path = Path(opts.source)
        if not src_path.exists():
            print(f"Error: Source directory does not exist: {src_path}")
            return 1

        copied_total, _missing = copy_embeddings_from_source(src_path, out_path)
        if copied_total == 0:
            print("\nNo embeddings were copied. Please check the source directory.")
            return 1

    print("\n✓ Data preparation complete!")
    print("\nNext steps:")
    print("  1. Review the data/ directory")
    print("  2. Test locally: uvicorn app:app --reload")
    print("  3. Deploy to Hugging Face Spaces")

    return 0


if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit(), which is intended
    # for interactive use and is absent when Python runs with -S.
    raise SystemExit(main())