Spaces:
Sleeping
Sleeping
| import glob | |
| import os | |
| from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.docstore.document import Document | |
| from sentence_transformers import SentenceTransformer | |
| from langchain_pinecone import PineconeVectorStore | |
| from pinecone.grpc import PineconeGRPC as Pinecone | |
| from pinecone import ServerlessSpec | |
| import time | |
| from langchain_community.embeddings import SentenceTransformerEmbeddings | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Re-wrap raw splitter output as Documents carrying only their source metadata.
def come_data(splits):
    """Convert split chunks into Documents, keeping only the 'source' metadata key.

    Parameters:
        splits: sequence of chunk objects exposing .page_content and
            .metadata['source'] (as produced by the text splitter).

    Returns:
        list[Document]: one Document per chunk, metadata reduced to
        {'source': <original source>}.
    """
    return [
        Document(
            page_content=split.page_content,
            metadata={'source': split.metadata['source']},
        )
        for split in splits
    ]
# Flatten arbitrarily nested lists into a single flat list.
def flatten_list(lst):
    """Return a flat list of all non-list leaves in *lst*, in order.

    A non-list argument is wrapped as a single-element list.
    """
    if not isinstance(lst, list):
        return [lst]
    flat = []
    for element in lst:
        flat.extend(flatten_list(element))
    return flat
# Load files, split them into chunks, and push everything into the vector DB.
def all_files(path):
    """Build a Pinecone vector store from every supported file under *path*.

    Recursively scans *path* for .txt, .csv and .pdf files, splits each
    loaded document into ~500-token chunks, recreates the Pinecone index
    named by the INDEX_NAME env var, and uploads the embedded chunks.

    Parameters:
        path: directory containing all the RAG source files.

    Returns:
        tuple (Vectorstore, flattened_list): the populated
        PineconeVectorStore and the flat list of chunk Documents uploaded.

    Raises:
        KeyError: if the PINECONE_API_KEY env var is not set.
    """
    print(f'RAG์ ๋ค์ด๊ฐ ๋ชจ๋ ๋ฐ์ดํฐ๋ {path}์ ๋ด์์ฃผ์ธ์.\n\n\n')
    flattened_list = flatten_list(_split_all(path))
    # Embedding model (768-dim Korean sentence encoder).
    embedding_model = SentenceTransformerEmbeddings(
        model_name='BM-K/KoSimCSE-roberta-multitask',
        model_kwargs={"trust_remote_code": True},
    )
    # Vector store client.
    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)
    index_name = os.getenv('INDEX_NAME')
    print('Vector DB ์ด๊ธฐํ. Index_name = ' + str(index_name))
    _recreate_index(pc, index_name)
    print('Vector DB ๋ค์ด๊ฐ๋ ์ค. Index_name = ' + str(index_name))
    Vectorstore = PineconeVectorStore.from_documents(
        documents=flattened_list,
        index_name=index_name,
        embedding=embedding_model
    )
    print('์ ์ฅ ์๋ฃ')
    return Vectorstore, flattened_list


def _split_all(path):
    """Load and chunk every supported file under *path*.

    Returns a list of per-file lists of chunk Documents (flattened by the
    caller).
    """
    # Extension -> loader class; add new file types here.
    loaders = {'.txt': TextLoader, '.csv': CSVLoader, '.pdf': PyMuPDFLoader}
    # Hoisted out of the per-file loop: the splitter is stateless across
    # files and constructing it re-resolves the tiktoken encoder each time.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator=".",
        chunk_size=500,
        chunk_overlap=0,
    )
    f_docs = []
    for file in glob.glob(path + '/**', recursive=True):
        loader_cls = loaders.get(os.path.splitext(file)[1])
        if loader_cls is None:
            continue
        document = loader_cls(file).load()
        name = os.path.basename(file)
        print(name + ' split ์งํ ์ค')
        splits = text_splitter.split_documents(document)
        docs = come_data(splits)
        f_docs.append(docs)
        print(name + ' split ์งํ ์๋ฃ. \n' + name + ' split ๊ฐฏ์ : ' + str(len(docs)))
    return f_docs


def _recreate_index(pc, index_name):
    """Delete *index_name* if it already exists, then create it fresh."""
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    existing = [idx.name for idx in pc.list_indexes().indexes]
    if index_name in existing:
        pc.delete_index(index_name)
        print('๊ธฐ์กด ์ธ๋ฑ์ค ์ญ์ ์๋ฃ')
        time.sleep(3)  # give Pinecone a moment to finish the deletion
    # dimension=768 must match the KoSimCSE-roberta embedding size.
    pc.create_index(
        index_name,
        dimension=768,
        metric='cosine',
        spec=spec
    )