# SwimSafer-Chatbot-v2 — scripts/ingest_documents.py
# Ingests raw documents: parses each file with the LlamaParse cloud API,
# caches the parsed text locally, embeds it, and upserts the vector into
# a Qdrant collection for retrieval by the chatbot.
# Standard library
import os
import sys
import time
import uuid

# Third-party
import requests
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct
from qdrant_client.models import Distance, VectorParams

# Add the absolute path to the `app` directory so local modules resolve
app_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "app"))
sys.path.insert(0, app_path)

# Local (requires the sys.path insertion above)
from utils.embedding import get_query_embedding
# Load environment variables (QDRANT_URL, LLAMAPARSE_API_KEY) from a .env file.
load_dotenv()

# Constants
RAW_DIR = "data/raw"        # source documents to ingest
PARSED_DIR = "data/parsed"  # cache directory for parsed text
COLLECTION_NAME = "chatbot_docs"

# Initialize Qdrant client from the environment-configured endpoint.
qdrant = QdrantClient(url=os.getenv("QDRANT_URL"))

# Create the collection only if it doesn't exist yet, so re-running the
# script never wipes previously ingested vectors. (`create_collection` is
# used instead of the deprecated `recreate_collection`; the existence
# guard makes them equivalent here.)
# NOTE(review): size=1536 assumes get_query_embedding returns 1536-dim
# vectors — confirm against utils.embedding.
existing_collections = {c.name for c in qdrant.get_collections().collections}
if COLLECTION_NAME not in existing_collections:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )
# Function to parse document using LlamaParse API
def parse_document_with_llamaparse(file_path):
    """Parse a document via the LlamaParse cloud API and return its Markdown.

    Uploads the file, polls the parsing job until it completes (bounded so a
    stuck job cannot hang the script forever), then fetches the Markdown
    result.

    Args:
        file_path: Path to the local document to upload and parse.

    Returns:
        The Markdown result as the raw response body text.

    Raises:
        RuntimeError: If the parsing job reports failure or never completes
            within the polling budget.
        requests.HTTPError: If any API call returns an error status.
    """
    api_key = os.getenv("LLAMAPARSE_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json",
    }

    # Step 1: Upload file. Explicit timeouts: requests has no default
    # timeout, so a dead connection would otherwise block indefinitely.
    upload_url = "https://api.cloud.llamaindex.ai/api/v1/parsing/upload"
    with open(file_path, "rb") as f:
        upload_response = requests.post(
            upload_url, headers=headers, files={"file": f}, timeout=60
        )
    upload_response.raise_for_status()
    print("Upload response:", upload_response.json())
    job_id = upload_response.json()["id"]

    # Step 2: Poll for job status — at most max_polls attempts, 2 s apart.
    status_url = f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}"
    max_polls = 300  # 300 * 2 s ≈ 10 minutes ceiling per document
    for _ in range(max_polls):
        status_response = requests.get(status_url, headers=headers, timeout=30)
        status_response.raise_for_status()
        status = status_response.json()["status"]
        if status == "completed":
            break
        if status == "failed":
            raise RuntimeError(f"Parsing job failed for {file_path}")
        time.sleep(2)
    else:
        # Loop exhausted without completion.
        raise RuntimeError(f"Parsing job timed out for {file_path}")

    # Step 3: Get Markdown result
    result_url = (
        f"https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}"
        "/result/markdown"
    )
    result_response = requests.get(result_url, headers=headers, timeout=60)
    result_response.raise_for_status()
    return result_response.text
# Process each file in RAW_DIR: parse, cache the parsed text, embed, upsert.
os.makedirs(PARSED_DIR, exist_ok=True)  # ensure the cache directory exists
for filename in os.listdir(RAW_DIR):
    if filename == ".DS_Store":  # skip macOS Finder metadata
        continue
    file_path = os.path.join(RAW_DIR, filename)
    print(f"Processing: {filename}")
    try:
        # Parse document
        parsed_text = parse_document_with_llamaparse(file_path)

        # Save parsed text — one .txt per source file (previously every
        # file was written to the same hard-coded name, clobbering the
        # prior result).
        parsed_file_path = os.path.join(PARSED_DIR, f"{filename}.txt")
        with open(parsed_file_path, "w", encoding="utf-8") as f:
            f.write(parsed_text)

        # Embed and store in Qdrant. Qdrant point IDs must be unsigned
        # integers or UUIDs, so derive a stable UUIDv5 from the filename:
        # re-ingesting the same file overwrites its existing point.
        embedding = get_query_embedding(parsed_text)
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, filename))
        point = PointStruct(
            id=point_id,
            vector=embedding,
            payload={"text": parsed_text, "source": filename},
        )
        qdrant.upsert(collection_name=COLLECTION_NAME, points=[point])
        print(f"✓ Successfully ingested: {filename}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the rest.
        print(f"✗ Failed to process {filename}: {e}")