Spaces:
Running
Running
| import os | |
| import json | |
| import time | |
| import pandas as pd | |
| import time | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_mistralai.embeddings import MistralAIEmbeddings | |
| from langchain_pinecone import PineconeVectorStore | |
| from langchain_core.documents import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from pinecone import Pinecone, ServerlessSpec | |
| from dotenv import load_dotenv | |
# Load variables from a local .env file into the process environment
# before any of the keys below are read.
load_dotenv()

index_name = os.environ.get("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Create the serverless index on first run only.
existing_indexes = [info["name"] for info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the embedding model's vector size;
        # presumably 1024 for "mistral-embed" - confirm.
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until Pinecone reports the new index as ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Module-level handle used by the import functions below.
index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping, character-measured chunks.

    Args:
        text: The string to split.
        chunk_size: Target chunk length passed to the splitter, measured
            with ``len`` (i.e. in characters). Defaults to 500.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 100.

    Returns:
        The list of chunks returned by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure lengths in characters
    )
    return text_splitter.split_text(text)
# Per-source import rules, keyed by the JSON file name. Each value is
# (id field, deadline field or None, id_document prefix,
#  top-level fields removed from the record before it is embedded).
_SOURCE_RULES = {
    "aides_entreprises.json": (
        "id_aid",
        "date_fin",
        "entreprises",
        ("contacts", "contact", "profils", "projets", "cache_indexation"),
    ),
    "aides_territoires.json": ("id", "submission_deadline", "territoires", ()),
    "les_aides.json": ("numero", None, "aides", ("cci", "url")),
}


def _prepare_subvention(subvention, source):
    """Return ``(record, metadata)`` for one grant without mutating *subvention*."""
    if source not in _SOURCE_RULES:
        raise ValueError(f"Unsupported source: {source!r}")
    id_field, deadline_field, prefix, dropped = _SOURCE_RULES[source]

    # Work on a filtered shallow copy so the caller's dict stays intact.
    record = {key: value for key, value in subvention.items() if key not in dropped}

    extra = dict(record.get("metadata", {}))
    if source == "les_aides.json":
        # "sirets" is stripped from both the vector metadata and the
        # embedded text (the record's own "metadata" entry).
        extra.pop("sirets", None)
        if "metadata" in record:
            record["metadata"] = extra

    deadline = record.get(deadline_field) if deadline_field is not None else None
    grant_id = record[id_field]
    metadata = {
        **extra,
        "id_subvention": grant_id,
        # -1 is the sentinel stored when the source gives no deadline.
        "deadline_date": deadline if deadline is not None else -1,
        "id_document": f"{prefix}_{grant_id}",
    }
    return record, metadata


def importAideEntreprise(subvention, source):
    """Embed one grant record and store it in the Pinecone index.

    The record is cleaned according to its source file, serialised to JSON,
    embedded with the "mistral-embed" model and written to the vector store
    under the id ``<prefix>_<grant id>``.

    Args:
        subvention: Dictionary describing one grant. It is not modified.
        source: Name of the JSON file the record came from; one of
            "aides_entreprises.json", "aides_territoires.json" or
            "les_aides.json".

    Returns:
        None. Any failure (unsupported source, missing id field, embedding
        or Pinecone error) is caught, printed with its traceback, and the
        function returns normally so a batch import can continue.
    """
    try:
        # Build the payload first so a malformed record or an unsupported
        # source fails with a clear message before any client is created.
        record, metadata = _prepare_subvention(subvention, source)

        # NOTE(review): the embedding client and vector store are rebuilt on
        # every call; consider hoisting them if import time matters.
        embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=os.environ.get("MISTRAL_API_KEY"))
        vector_store = PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace

        # The whole cleaned record is embedded as one JSON document.
        # ensure_ascii=False keeps accented characters readable instead of
        # turning them into \uXXXX escapes in the embedded text.
        json_text = json.dumps(record, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)

        # 'id_document' doubles as the vector id.
        doc_id = metadata["id_document"]
        print("Before add_documents")
        vector_store.add_documents(documents=[document], ids=[doc_id])
        print(f"Stored document with ID: {doc_id} from source: {source}")
    except Exception as e:
        import traceback

        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions, source, delay=0.5):
    """Import every grant record of one source, pausing between records.

    Args:
        subventions: Iterable of grant dictionaries.
        source: File name the records came from; forwarded unchanged to
            ``importAideEntreprise``.
        delay: Seconds to sleep after each record. Defaults to 0.5, the
            previously hard-coded pause; a value <= 0 disables the pause.
    """
    for subvention in subventions:
        importAideEntreprise(subvention, source)
        if delay > 0:
            time.sleep(delay)
def go():
    """Scan the ``data`` directory and import the grants of "les_aides.json".

    The name of every ``.json`` file found is printed, but only
    "les_aides.json" is actually loaded and passed to ``loopSubventions``.
    """
    print("Importing Aide Entreprise data...")
    data_dir = 'data'
    for entry in os.listdir(data_dir):
        if not entry.endswith(".json"):
            continue
        print(entry)
        # Only this source is imported; other JSON files are just listed.
        if entry != "les_aides.json":
            continue
        json_path = os.path.join(data_dir, entry)
        with open(json_path, "r", encoding="utf-8") as handle:
            subventions = json.load(handle)
        loopSubventions(subventions, source=entry)


if __name__ == "__main__":
    go()