Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import requests | |
| from pymilvus import MilvusClient, DataType, Schema, Collection, utility | |
| from dotenv import load_dotenv | |
# Run python-dotenv's loader first so the lookups below can see any
# variables it adds to the process environment.
load_dotenv()

# Bearer token sent in the Authorization header of the Vertopal requests.
VERTOPAL_API_KEY = os.environ.get("VERTOPAL_API_KEY")
# Endpoint URI and access token handed to MilvusClient. Each of these three
# settings is None when its environment variable is not set.
ZILLIZ_CLUSTER_ENDPOINT = os.environ.get("ZILLIZ_CLUSTER_ENDPOINT")
ZILLIZ_TOKEN = os.environ.get("ZILLIZ_TOKEN")
def convert_pdf_to_json(file_path):
    """Send a PDF to the Vertopal convert endpoint and return its connector.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        The value stored under ``result -> output -> connector`` in the JSON
        body of the response.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        requests.HTTPError: If the API responds with an error status code.
        KeyError: If the response body lacks the expected
            ``result``/``output``/``connector`` keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    headers = {
        "Authorization": f"Bearer {VERTOPAL_API_KEY}"
    }
    # NOTE(review): "[APP_ID]" looks like an unfilled placeholder - confirm.
    # NOTE(review): requests form-encodes a nested dict value by iterating its
    # keys, so "parameters" is probably transmitted as just "output" and the
    # "json" value is dropped - confirm the expected payload shape against
    # the Vertopal API documentation.
    data = {
        "app": "[APP_ID]",
        "parameters": {
            "output": "json"
        }
    }
    # Open the upload inside a context manager so the file handle is released
    # even when the request itself raises; previously it was never closed.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            files={"file": pdf_file},
        )
    response.raise_for_status()
    json_data = response.json()
    return json_data["result"]["output"]["connector"]
def download_json_file(connector):
    """Ask the Vertopal download endpoint for a connector's result.

    Args:
        connector: Connector identifier, sent as the ``connector`` form field.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    auth_header = {"Authorization": f"Bearer {VERTOPAL_API_KEY}"}
    payload = {"app": "[APP_ID]", "connector": connector}
    reply = requests.post(
        "https://api.vertopal.com/v1/download/url/get",
        headers=auth_header,
        data=payload,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    reply.raise_for_status()
    return reply.json()
def create_milvus_client_and_collection(collection_name):
    """Open a MilvusClient and make sure ``collection_name`` exists.

    All calls go through the ``MilvusClient`` connection created here. The
    ORM helpers (``utility.has_collection`` / ``Collection``) are deliberately
    not used: they look up a separately registered connection that this
    module never establishes.

    Args:
        collection_name: Name of the collection to look up or create.

    Returns:
        A ``(client, collection_name)`` tuple. ``MilvusClient`` addresses
        collections by name and ``create_collection`` returns nothing, so the
        name is the handle callers pass back to the client.
    """
    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)
    if not client.has_collection(collection_name):
        schema = MilvusClient.create_schema(
            auto_id=False,
            enable_dynamic_field=True,
            description="",
        )
        schema.add_field(
            field_name="primary_key",
            datatype=DataType.INT64,
            description="The Primary Key",
            is_primary=True,
            auto_id=False,
        )
        schema.add_field(
            field_name="json_data",
            datatype=DataType.VARCHAR,
            description="JSON Data",
            max_length=65535,
        )
        # NOTE(review): Milvus has historically required at least one vector
        # field per collection - confirm the target cluster accepts this
        # scalar-only schema.
        client.create_collection(collection_name=collection_name, schema=schema)
    return client, collection_name


def upload_json_to_milvus(json_data, collection_name):
    """Serialise ``json_data`` and insert it as one row of ``collection_name``.

    The collection is created first when it does not exist yet. The row's
    primary key is the row count the server reports before the insert.

    Args:
        json_data: Any JSON-serialisable object.
        collection_name: Target collection.
    """
    client, _ = create_milvus_client_and_collection(collection_name)
    try:
        # NOTE(review): the reported row count may lag behind recent inserts,
        # so keys derived from it are not guaranteed unique - confirm this is
        # acceptable or switch to an independent id source.
        stats = client.get_collection_stats(collection_name=collection_name)
        next_key = int(stats.get("row_count", 0))
        # NOTE(review): json_data is a VARCHAR capped at max_length=65535;
        # larger documents are expected to be rejected by the server.
        row = {"primary_key": next_key, "json_data": json.dumps(json_data)}
        # MilvusClient.insert takes rows as dicts keyed by field name.
        client.insert(collection_name=collection_name, data=[row])
    finally:
        # A new client is opened per upload, so release its connection.
        client.close()
def process_pdfs(directory):
    """Convert every PDF in ``directory`` to JSON and upload each result.

    Directory entries are matched on a case-insensitive ``.pdf`` suffix, so
    ``REPORT.PDF`` is picked up as well, and they are processed in sorted
    name order so repeated runs handle files in the same sequence.
    Sub-directories are not descended into.

    Args:
        directory: Folder that contains the PDF files.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    pdf_names = sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".pdf")
    )
    for name in pdf_names:
        file_path = os.path.join(directory, name)
        print(f"Processing file: {file_path}")
        connector = convert_pdf_to_json(file_path)
        json_data = download_json_file(connector)
        upload_json_to_milvus(json_data, "pdf_json_collection")
        print(f"Uploaded JSON data for file: {file_path}")
def upload_persona_json(file_path):
    """Load a persona JSON file and upload it to ``persona_collection``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding (for example
    # cp1252 on Windows) can mis-decode or reject non-ASCII JSON text.
    with open(file_path, "r", encoding="utf-8") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")
# Script entry point: ingest the PDF folder first, then the persona file.
if __name__ == "__main__":
    # Hard-coded locations on the author's machine.
    pdf_directory = (
        "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    )
    process_pdfs(pdf_directory)

    persona_json_path = (
        "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"
    )
    upload_persona_json(persona_json_path)