Syluh27 commited on
Commit ·
9c363a9
1
Parent(s): cf5fda3
model.py
CHANGED
|
@@ -12,25 +12,16 @@ HF_TOKEN = os.getenv("HF_TOKEN")
|
|
| 12 |
if HF_TOKEN is None:
|
| 13 |
raise ValueError("No se encontró la variable de entorno HF_TOKEN.")
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
os.makedirs(chroma_dir, exist_ok=True) # Asegurar que el directorio existe
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
# Eliminar caché previa y datos antiguos de Chroma
|
| 21 |
-
def limpiar_entorno():
|
| 22 |
cache_path = "/home/user/.cache/huggingface/hub/datasets--VictorCarr02--Conversational-Agent-LawsEC"
|
| 23 |
if os.path.exists(cache_path):
|
| 24 |
-
print(f"Eliminando caché: {cache_path}")
|
| 25 |
shutil.rmtree(cache_path)
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
shutil.rmtree(chroma_dir)
|
| 30 |
-
os.makedirs(chroma_dir, exist_ok=True)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
limpiar_entorno()
|
| 34 |
|
| 35 |
# Descargar los archivos
|
| 36 |
embedding_path = hf_hub_download(
|
|
@@ -38,58 +29,45 @@ embedding_path = hf_hub_download(
|
|
| 38 |
repo_type="dataset",
|
| 39 |
filename="data_level0.bin",
|
| 40 |
token=HF_TOKEN,
|
| 41 |
-
force_download=True
|
| 42 |
)
|
| 43 |
|
| 44 |
-
|
| 45 |
-
temp_chroma_path = hf_hub_download(
|
| 46 |
repo_id="VictorCarr02/Conversational-Agent-LawsEC",
|
| 47 |
repo_type="dataset",
|
| 48 |
filename="chroma.sqlite3",
|
| 49 |
token=HF_TOKEN,
|
| 50 |
-
force_download=True
|
| 51 |
)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
print(f"
|
|
|
|
| 60 |
|
| 61 |
-
#
|
| 62 |
-
chromadb_client = chromadb.PersistentClient(path=
|
| 63 |
collection = chromadb_client.get_or_create_collection(name="mis_embeddings")
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
embeddings = HuggingFaceEmbeddings(
|
| 67 |
-
model_name="sentence-transformers/all-mpnet-base-v2", # Modelo estándar
|
| 68 |
-
model_kwargs={"device": "cpu"}
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
# Inicializar vector store
|
| 72 |
-
vector_store = Chroma(
|
| 73 |
-
client=chromadb_client,
|
| 74 |
-
collection_name="mis_embeddings",
|
| 75 |
-
embedding_function=embeddings
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
# Configurar modelo Mistral
|
| 79 |
api_key = os.getenv("MISTRAL_API_KEY")
|
| 80 |
-
if not api_key:
|
| 81 |
-
raise ValueError("MISTRAL_API_KEY no configurada")
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
temperature=0.1
|
| 87 |
-
)
|
| 88 |
|
| 89 |
-
# Crear
|
|
|
|
|
|
|
|
|
|
| 90 |
rag_chain = RetrievalQA.from_chain_type(
|
| 91 |
llm=llm,
|
| 92 |
-
retriever=vector_store.as_retriever(
|
| 93 |
-
chain_type="stuff"
|
| 94 |
-
|
| 95 |
-
)
|
|
|
|
| 12 |
if HF_TOKEN is None:
|
| 13 |
raise ValueError("No se encontró la variable de entorno HF_TOKEN.")
|
| 14 |
|
| 15 |
def eliminar_cache_huggingface():
    """Delete the locally cached copy of the LawsEC dataset.

    Removing the hub cache directory forces the subsequent
    hf_hub_download calls to fetch a fresh copy of every file.
    Does nothing if the cache directory is absent.
    """
    # NOTE(review): hard-coded Space cache location — confirm it matches
    # the runtime user's HF_HOME.
    ruta_cache = (
        "/home/user/.cache/huggingface/hub/"
        "datasets--VictorCarr02--Conversational-Agent-LawsEC"
    )
    if os.path.exists(ruta_cache):
        print(f"Eliminando caché existente en: {ruta_cache}")
        shutil.rmtree(ruta_cache)
        print("Caché eliminada. Forzando nueva descarga.")
|
| 22 |
|
| 23 |
# Clear any stale cache before downloading so the files below are fresh
eliminar_cache_huggingface()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Download the dataset files; force_download guarantees a fresh copy even
# if the cache cleanup above could not run.
embedding_path = hf_hub_download(
    # NOTE(review): this repo_id line was lost in the diff rendering; it is
    # restored from the chroma.sqlite3 download below and from the cache
    # path, which name the same dataset — confirm against the original file.
    repo_id="VictorCarr02/Conversational-Agent-LawsEC",
    repo_type="dataset",
    filename="data_level0.bin",
    token=HF_TOKEN,
    force_download=True,  # Fuerza la descarga
)

chroma_path = hf_hub_download(
    repo_id="VictorCarr02/Conversational-Agent-LawsEC",
    repo_type="dataset",
    filename="chroma.sqlite3",
    token=HF_TOKEN,
    force_download=True,  # Fuerza la descarga
)

print("Archivos descargados en:")
print(f"Embeddings: {embedding_path}")
print(f"ChromaDB: {chroma_path}")
|
| 46 |
|
| 47 |
# Open the downloaded Chroma database and build the vector store.
#
# FIX(review): the original code deleted chroma_path immediately after
# force-downloading it, then handed the (now missing) .sqlite3 file to
# PersistentClient. PersistentClient expects the *directory* containing
# chroma.sqlite3, so keep the downloaded file and open its parent dir.
chroma_dir = os.path.dirname(chroma_path)
chromadb_client = chromadb.PersistentClient(path=chroma_dir)
collection = chromadb_client.get_or_create_collection(name="mis_embeddings")

# FIX(review): HuggingFaceEmbeddings has no `path` kwarg and
# "mistralai/MistralAIEmbeddings" is not a sentence-transformers model.
# Restored to the standard CPU model used by the previous revision of
# this file (the embeddings stored in the DB were built with it).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
)

# FIX(review): langchain's Chroma wrapper takes client/collection_name,
# not a raw `collection` object (previous revision used this form).
vector_store = Chroma(
    client=chromadb_client,
    collection_name="mis_embeddings",
    embedding_function=embeddings,
)
|
| 57 |
|
| 58 |
# Read the Mistral API key from the environment and fail fast if missing.
api_key = os.getenv("MISTRAL_API_KEY")
if api_key is None:
    raise ValueError("La clave API MISTRAL_API_KEY no está configurada como variable de entorno.")

# Chat model backed by the Mistral API.
llm = ChatMistralAI(api_key=api_key)

# Retriever over the vector store, then the RAG question-answering chain.
retriever = vector_store.as_retriever()
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
|
|
|