Spaces:
Runtime error
Runtime error
| import os | |
| import pinecone | |
| from pydantic import Field | |
| from vector_db import Document | |
| from html_parser import HTMLParser | |
| from langchain.vectorstores import Pinecone | |
| from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME | |
| from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID | |
| from langchain.embeddings import OpenAIEmbeddings | |
# Folder whose files are listed, parsed and indexed further down in this script.
WEBSITE_FOLDER = 'website'

# Shared parser instance; its parse_file() is called once per file.
parser = HTMLParser()

# Connect the Pinecone client. Both values come from config; they can be
# looked up at app.pinecone.io (the environment is shown next to the API key
# in the console).
pinecone.init(
    environment=PINECONE_ENVIRONMENT,
    api_key=PINECONE_API_KEY,
)

# Azure embedding model definition. All connection settings are read from
# config and passed through unchanged.
# NOTE(review): chunk_size is presumably the number of texts sent per
# embedding request — confirm against the LangChain OpenAIEmbeddings docs.
_embedding_settings = dict(
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_key=EMBEDDING_API_KEY,
    deployment=EMBEDDING_DEPLOYMENT_ID,
    chunk_size=16,
)
embeddings = OpenAIEmbeddings(**_embedding_settings)
# Create the target index when it does not exist yet. INDEX_NAME is checked
# first so that list_indexes() is only called when a name is configured.
index_is_missing = bool(INDEX_NAME) and INDEX_NAME not in pinecone.list_indexes()
if index_is_missing:
    # NOTE(review): dimension=1536 presumably matches the size of the vectors
    # produced by the configured embedding deployment — confirm.
    pinecone.create_index(INDEX_NAME, dimension=1536, metric="cosine")
    print(f"Index {INDEX_NAME} created successfully")

# Clear the index before uploading so each run starts from an empty state.
index = pinecone.Index(INDEX_NAME)
index.delete(delete_all=True)
# Parse every file in WEBSITE_FOLDER and wrap each item returned by the parser
# in a Document whose metadata records the path of the file it came from.
#
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `documents` (and therefore any index-based ids derived from it) reproducible
# between runs.
files_src = sorted(os.listdir(WEBSITE_FOLDER))
documents = []
for file in files_src:
    filepath = os.path.join(WEBSITE_FOLDER, file)
    # os.listdir() also returns sub-directories; only regular files are parsed.
    # NOTE(review): assumes parse_file() expects the path of a regular file —
    # confirm against html_parser.HTMLParser.
    if not os.path.isfile(filepath):
        continue
    data = parser.parse_file(filepath)
    documents.extend(
        Document(page_content=d, metadata={"source": filepath}) for d in data
    )

# Report how many documents were collected in total.
print(len(documents))
# Upload the collected documents to the Pinecone index, giving every document
# an explicit, unique id. Nothing is uploaded when no documents were found.
if documents:
    # Documents built in this script carry only a "source" key in their
    # metadata, so indexing metadata['document_id'] directly raises KeyError.
    # Prefer 'document_id' when present and fall back to the source path; the
    # running index keeps ids unique when one file yields several documents.
    document_id = [
        f"{d.metadata.get('document_id') or d.metadata.get('source', 'document')}_{idx}"
        for idx, d in enumerate(documents)
    ]
    Pinecone.from_documents(documents, embeddings, ids=document_id, index_name=INDEX_NAME)
    message = f"Add website to {INDEX_NAME} sucessfully"
    # Emit the success message; it was previously built but never shown.
    print(message)