# SwimSafer-Chatbot-v2 — scripts/ingest_documents.py
# Ingests raw documents: parses each file with the LlamaParse cloud API,
# caches the parsed text locally, embeds it, and upserts the vector into
# a Qdrant collection for retrieval by the chatbot.
# Standard library
import os
import sys
import time
import uuid

# Third-party
import requests
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct
from qdrant_client.models import Distance, VectorParams

# Add the absolute path to the `app` directory so local modules resolve
app_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "app"))
sys.path.insert(0, app_path)

# Local (requires the sys.path insertion above)
from utils.embedding import get_query_embedding
# Load environment variables (QDRANT_URL, LLAMAPARSE_API_KEY) from a .env file.
load_dotenv()

# Constants
RAW_DIR = "data/raw"        # source documents to ingest
PARSED_DIR = "data/parsed"  # cache directory for parsed text
COLLECTION_NAME = "chatbot_docs"

# Initialize Qdrant client from the environment-configured endpoint.
qdrant = QdrantClient(url=os.getenv("QDRANT_URL"))

# Create the collection only if it doesn't exist yet, so re-running the
# script never wipes previously ingested vectors. (`create_collection` is
# used instead of the deprecated `recreate_collection`; the existence
# guard makes them equivalent here.)
# NOTE(review): size=1536 assumes get_query_embedding returns 1536-dim
# vectors — confirm against utils.embedding.
existing_collections = {c.name for c in qdrant.get_collections().collections}
if COLLECTION_NAME not in existing_collections:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )
# Function to parse document using LlamaParse API
def parse_document_with_llamaparse(file_path):
    """Parse a document via the LlamaParse cloud API and return its Markdown.

    Uploads the file, polls the parsing job until it completes (bounded so a
    stuck job cannot hang the script forever), then fetches the Markdown
    result.

    Args:
        file_path: Path to the local document to upload and parse.

    Returns:
        The Markdown result as the raw response body text.

    Raises:
        RuntimeError: If the parsing job reports failure or never completes
            within the polling budget.
        requests.HTTPError: If any API call returns an error status.
    """
    api_key = os.getenv("LLAMAPARSE_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json",
    }

    # Step 1: Upload file. Explicit timeouts: requests has no default
    # timeout, so a dead connection would otherwise block indefinitely.
    upload_url = "https://api.cloud.llamaindex.ai/api/v1/parsing/upload"
    with open(file_path, "rb") as f:
        upload_response = requests.post(
            upload_url, headers=headers, files={"file": f}, timeout=60
        )
    upload_response.raise_for_status()
    print("Upload response:", upload_response.json())
    job_id = upload_response.json()["id"]

    # Step 2: Poll for job status — at most max_polls attempts, 2 s apart.
    status_url = f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}"
    max_polls = 300  # 300 * 2 s ≈ 10 minutes ceiling per document
    for _ in range(max_polls):
        status_response = requests.get(status_url, headers=headers, timeout=30)
        status_response.raise_for_status()
        status = status_response.json()["status"]
        if status == "completed":
            break
        if status == "failed":
            raise RuntimeError(f"Parsing job failed for {file_path}")
        time.sleep(2)
    else:
        # Loop exhausted without completion.
        raise RuntimeError(f"Parsing job timed out for {file_path}")

    # Step 3: Get Markdown result
    result_url = (
        f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}"
        "/result/markdown"
    )
    result_response = requests.get(result_url, headers=headers, timeout=60)
    result_response.raise_for_status()
    return result_response.text
# Process each file in RAW_DIR: parse, cache the parsed text, embed, upsert.
os.makedirs(PARSED_DIR, exist_ok=True)  # ensure the cache directory exists
for filename in os.listdir(RAW_DIR):
    if filename == ".DS_Store":  # skip macOS Finder metadata
        continue
    file_path = os.path.join(RAW_DIR, filename)
    print(f"Processing: {filename}")
    try:
        # Parse document
        parsed_text = parse_document_with_llamaparse(file_path)

        # Save parsed text — one .txt per source file (previously every
        # file was written to the same hard-coded name, clobbering the
        # prior result).
        parsed_file_path = os.path.join(PARSED_DIR, f"{filename}.txt")
        with open(parsed_file_path, "w", encoding="utf-8") as f:
            f.write(parsed_text)

        # Embed and store in Qdrant. Qdrant point IDs must be unsigned
        # integers or UUIDs, so derive a stable UUIDv5 from the filename:
        # re-ingesting the same file overwrites its existing point.
        embedding = get_query_embedding(parsed_text)
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, filename))
        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload={"text": parsed_text, "source": filename},
        )
        qdrant.upsert(collection_name=COLLECTION_NAME, points=[point])
        print(f"✓ Successfully ingested: {filename}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the rest.
        print(f"✗ Failed to process {filename}: {e}")