File size: 3,972 Bytes
e66ee1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efc08f7
 
 
 
 
e66ee1b
 
efc08f7
e66ee1b
 
 
 
efc08f7
e66ee1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efc08f7
 
 
e66ee1b
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Reprocess all documents with new chunk parameters

This script will:
1. Fetch all document IDs from the database
2. Call the reprocess API endpoint for each document
3. Report progress and results
"""

import asyncio
import sys
from pathlib import Path

import httpx
from sqlalchemy import select

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from backend.database import async_session  # noqa: E402
from backend.models.document import Document  # noqa: E402


async def get_all_document_ids() -> list[tuple[int, str]]:
    """Return (id, title) rows for every document stored in the database."""
    # Build the statement once, outside the session context.
    query = select(Document.id, Document.title)
    async with async_session() as session:
        rows = await session.execute(query)
        return list(rows.all())


async def reprocess_document(
    doc_id: int,
    chunk_size: int = 400,
    overlap: int = 80,
    base_url: str = "http://localhost:8000",
    timeout: float = 300.0,
) -> dict:
    """Call the reprocess API endpoint for a single document.

    Args:
        doc_id: Primary key of the document to reprocess.
        chunk_size: Chunk size (in tokens/chars, per the API) to apply.
        overlap: Chunk overlap to apply.
        base_url: Root URL of the API server. Previously hard-coded to
            localhost; parameterized so the script can target other hosts.
        timeout: Per-request timeout in seconds — reprocessing large
            documents can take minutes, hence the generous default.

    Returns:
        ``{"success": True, "data": <response JSON>}`` on HTTP 200, otherwise
        ``{"success": False, "error": "<status>: <body>"}``.
    """
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(
            f"{base_url}/api/ingest/reprocess/{doc_id}",
            json={"chunk_size": chunk_size, "overlap": overlap},
        )

    # httpx reads the response body eagerly, so it is safe to inspect the
    # response after the client context has closed.
    if response.status_code == 200:
        return {"success": True, "data": response.json()}
    return {"success": False, "error": f"{response.status_code}: {response.text}"}


async def main():
    """Reprocess every document and print per-document progress plus a summary.

    Fetches all (id, title) pairs, calls the reprocess API for each one
    sequentially (the API defaults of chunk_size=400 / overlap=80 match the
    banner printed below), then reports success/failure counts and chunk
    deltas.
    """
    print("🔄 Reprocessing All Documents with New Chunk Parameters")
    print("=" * 80)
    print("   New chunk_size: 400")
    print("   New overlap: 80")
    print("=" * 80)

    # Get all documents
    print("\n📋 Fetching document list...")
    docs = await get_all_document_ids()

    if not docs:
        print("⚠️  No documents found in database")
        return

    print(f"✅ Found {len(docs)} documents\n")

    # Reprocess each document sequentially; one failure must not abort the run.
    results = []
    for i, (doc_id, title) in enumerate(docs, 1):
        print(f"[{i}/{len(docs)}] Processing: {title} (ID: {doc_id})")

        try:
            result = await reprocess_document(doc_id)

            if result["success"]:
                data = result["data"]
                print("   ✅ Success!")
                print(f"      Old chunks: {data['old_chunks_deleted']}")
                print(f"      New chunks: {data['new_chunks_created']}")
                print(f"      Embeddings: {data['new_embeddings_created']}")
                results.append({"doc_id": doc_id, "title": title, "success": True, "data": data})
            else:
                print(f"   ❌ Failed: {result['error']}")
                results.append(
                    {"doc_id": doc_id, "title": title, "success": False, "error": result["error"]}
                )

        except Exception as e:
            # Broad catch is deliberate: this is a top-level batch driver and
            # any per-document error (network, JSON, KeyError) is recorded
            # and reported in the summary instead of killing the run.
            print(f"   ❌ Exception: {str(e)}")
            results.append({"doc_id": doc_id, "title": title, "success": False, "error": str(e)})

        print()

    # Summary
    print("=" * 80)
    print("📊 SUMMARY")
    print("=" * 80)

    successful = sum(1 for r in results if r["success"])
    failed = len(results) - successful

    print(f"✅ Successful: {successful}")
    print(f"❌ Failed: {failed}")

    if successful > 0:
        total_old_chunks = sum(r["data"]["old_chunks_deleted"] for r in results if r["success"])
        total_new_chunks = sum(r["data"]["new_chunks_created"] for r in results if r["success"])
        print(f"\n📈 Total old chunks deleted: {total_old_chunks}")
        print(f"📈 Total new chunks created: {total_new_chunks}")
        increase = total_new_chunks - total_old_chunks
        # BUG FIX: the old code divided by total_old_chunks unconditionally,
        # raising ZeroDivisionError when no prior chunks existed.
        if total_old_chunks > 0:
            percent = (total_new_chunks / total_old_chunks - 1) * 100
            print(f"📈 Chunk increase: {increase:+d} ({percent:.1f}%)")
        else:
            print(f"📈 Chunk increase: {increase:+d} (n/a: no prior chunks)")

    if failed > 0:
        print("\n❌ Failed documents:")
        for r in results:
            if not r["success"]:
                print(f"   - {r['title']} (ID: {r['doc_id']}): {r['error']}")


# Script entry point: run the async batch driver to completion.
if __name__ == "__main__":
    asyncio.run(main())