File size: 1,806 Bytes
389c5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import asyncio
from dotenv import load_dotenv

from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document

# Read settings from the local env file into the process environment before
# anything below looks up the API key.
load_dotenv("config.env")

# Options for the shared LlamaParse client, gathered in one place.
# NOTE(review): option semantics below are inferred from their names —
# confirm against the LlamaParse documentation:
#   * auto_mode* presumably escalate parsing on pages containing images/tables
#   * bbox_top / bbox_bottom presumably crop page margins (fractions of the page)
_PARSER_OPTIONS = dict(
    api_key=os.getenv("LLAMAPARSE_API_KEY"),
    result_type="markdown",
    verbose=True,
    extract_charts=True,
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
    bbox_top=0.05,
    bbox_bottom=0.1,
)

# Module-level parser instance used by the rest of the script.
parser = LlamaParse(**_PARSER_OPTIONS)

# Make sure the index output directory exists (no error if already present).
os.makedirs("data/processed/lp/indices", exist_ok=True)

async def parse_docs(
    input_dir: str = "data/raw/GuidelinesSections",
    output_dir: str = "data/processed/lp/indices",
) -> None:
    """Parse every PDF in *input_dir* and persist one vector index per file.

    Each ``.pdf`` file is passed to the module-level ``parser``. The text of
    the returned documents is joined into a single ``Document``, split into
    nodes, indexed with ``VectorStoreIndex`` and persisted under
    ``<output_dir>/<short name>``. The short name is the filename without its
    trailing ``.pdf`` and without the ``Kenya-ARV-Guidelines-2022-`` marker.

    A file is reported and skipped when parsing raises or when no text comes
    back, so an empty index is never written and announced as a success.
    Errors raised while building or persisting an index are not caught.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.pdf`` files.
        output_dir: Directory under which each index gets its own sub-folder.

    Raises:
        OSError: If *input_dir* cannot be listed.
    """
    pdf_suffix = ".pdf"

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(pdf_suffix):
            continue

        filepath = f"{input_dir}/{filename}"
        print(f"Processing: {filepath}")

        try:
            documents = await parser.aload_data(filepath)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"❌ Failed to parse {filename}: {e}")
            continue

        full_text = "\n\n".join(doc.text for doc in documents)

        # The parser may report an error itself and hand back an empty result
        # instead of raising; without this guard an empty index would be
        # persisted and reported as saved.
        if not full_text.strip():
            print(f"⚠️ No text extracted from {filename}; skipping")
            continue

        combined_doc = Document(text=full_text)

        node_parser = SimpleNodeParser()
        nodes = node_parser.get_nodes_from_documents([combined_doc])

        index = VectorStoreIndex(nodes)

        # Strip only the trailing extension (not every ".pdf" substring), then
        # drop the common guideline marker to get a compact folder name.
        stem = filename[: -len(pdf_suffix)]
        short_filename = stem.replace("Kenya-ARV-Guidelines-2022-", "")

        index.storage_context.persist(persist_dir=f"{output_dir}/{short_filename}")
        print(f"✅ Saved index for {short_filename}")

# Script entry point: only when executed directly (not on import), run the
# async parsing pipeline to completion.
if __name__ == "__main__":
    pipeline = parse_docs()
    asyncio.run(pipeline)