File size: 3,220 Bytes
e146c0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import sys
import time
import requests
from dotenv import load_dotenv

# Add the absolute path to the `app` directory
app_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "app"))
sys.path.insert(0, app_path)

from utils.embedding import get_query_embedding
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct
from qdrant_client.models import Distance, VectorParams

# Load environment variables
load_dotenv()

# Constants
RAW_DIR = "data/raw"
PARSED_DIR = "data/parsed"
COLLECTION_NAME = "chatbot_docs"

# Initialize Qdrant client
qdrant = QdrantClient(url=os.getenv("QDRANT_URL"))

# Create collection if it doesn't exist
if COLLECTION_NAME not in [c.name for c in qdrant.get_collections().collections]:
    qdrant.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )

# Function to parse document using LlamaParse API
def parse_document_with_llamaparse(file_path):
    api_key = os.getenv("LLAMAPARSE_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }

    # Step 1: Upload file
    with open(file_path, "rb") as f:
        files = {"file": f}
        upload_url = "https://api.cloud.llamaindex.ai/api/v1/parsing/upload"
        upload_response = requests.post(upload_url, headers=headers, files=files)
        upload_response.raise_for_status()
        print("Upload response:", upload_response.json())
        job_id = upload_response.json()["id"]

    # Step 2: Poll for job status
    status_url = f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}"
    while True:
        status_response = requests.get(status_url, headers=headers)
        status_response.raise_for_status()
        status_data = status_response.json()
        if status_data["status"] == "completed":
            break
        elif status_data["status"] == "failed":
            raise Exception(f"Parsing job failed for {file_path}")
        time.sleep(2)

    # Step 3: Get Markdown result
    result_url = f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown"
    result_response = requests.get(result_url, headers=headers)
    result_response.raise_for_status()
    return result_response.text

# Process each file in RAW_DIR
for filename in os.listdir(RAW_DIR):
    if filename == ".DS_Store":
        continue

    file_path = os.path.join(RAW_DIR, filename)
    print(f"Processing: {filename}")

    try:
        # Parse document
        parsed_text = parse_document_with_llamaparse(file_path)

        # Save parsed text
        parsed_file_path = os.path.join(PARSED_DIR, f"{filename}.txt")
        with open(parsed_file_path, "w", encoding="utf-8") as f:
            f.write(parsed_text)

        # Embed and store in Qdrant
        embedding = get_query_embedding(parsed_text)
        point = PointStruct(id=filename, vector=embedding, payload={"text": parsed_text})
        qdrant.upsert(collection_name=COLLECTION_NAME, points=[point])

        print(f"✓ Successfully ingested: {filename}")

    except Exception as e:
        print(f"✗ Failed to process {filename}: {e}")