# Scraped page banner (Hugging Face Spaces status, not part of the script): Runtime error
import os

import openai
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from pypinyin import lazy_pinyin
from tqdm import tqdm

# SECURITY: an OpenAI secret key used to be hard-coded on this line. A key
# that has been committed to source must be treated as compromised and
# rotated. The key is now taken only from the process environment, and the
# script fails fast with a clear message when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "running this script."
    )

# Shared embedding client, created once for the whole module.
embedding = OpenAIEmbeddings()
def list_files(directory):
    """Return the paths of all files found under *directory*, recursively.

    Args:
        directory: Root directory to walk with ``os.walk``.

    Returns:
        A list of file paths, each built by joining the directory that
        contains the file with the file name. Directories themselves are
        not included. The order is whatever ``os.walk`` yields.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]
def _build_domain_store(domain_name):
    """Load every file under one domain folder and index it in Chroma.

    Files are read from ``./example_data/<domain_name>`` with
    ``PyPDFLoader``, split into chunks of at most 1000 characters with no
    overlap, embedded, and written to a Chroma store whose directory name
    encodes the domain (as pinyin), the file count and the page count.

    Args:
        domain_name: Name of the sub-folder of ``./example_data`` to index.

    Returns:
        The Chroma store that was created, or ``None`` when the folder
        produced no documents.
    """
    directory_path = f"./example_data/{domain_name}"
    select_files = list_files(directory_path)

    select_pages = []
    # Iterate the list itself rather than enumerate(...): tqdm can then read
    # its length and show a real progress bar, and the index was never used.
    for item in tqdm(select_files):
        print(item)
        select_pages.extend(PyPDFLoader(item).load_and_split())

    # The domain names are Chinese; the store directory uses their pinyin.
    pinyin = "".join(lazy_pinyin(domain_name))
    persist_vector_path = f"./vector_data/{pinyin}_{len(select_files)}_{len(select_pages)}"
    print(persist_vector_path)

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(select_pages)
    if not documents:
        # Nothing to embed (missing or empty folder): skip instead of asking
        # the vector store to index an empty batch.
        print(f"No documents found for {domain_name}; skipping")
        return None

    # Reuse the module-level embedding client instead of building a new one
    # for every domain.
    # NOTE(review): depending on the installed Chroma version an explicit
    # persist call may be needed to flush the store to disk; confirm.
    return Chroma.from_documents(documents, embedding, persist_directory=persist_vector_path)


if __name__ == "__main__":
    domains = ["农业", "宗教与文化", "建筑业与制造业", "医疗卫生保健", "国家治理", "法律法规", "财政税收", "教育", "金融", "贸易", "宏观经济", "社会发展", "科学技术", "能源环保", "国际关系", "国防安全"]
    for domain_name in domains:
        _build_domain_store(domain_name)
# Example: querying a previously persisted store (replace 'path' with the store directory).
# db = Chroma(persist_directory='path', embedding_function=embedding)
# docs = db.similarity_search_with_score(query="宏观经济有什么影响", k=3)
# contents = [doc[0] for doc in docs]
# relevance = " ".join(doc.page_content for doc in contents)
# source = [doc.metadata for doc in contents]