Spaces:
Runtime error
Runtime error
| from loguru import logger | |
| import json | |
| from bin_public.utils.utils_db import * | |
| from bin_public.config.presets import MIGRAINE_PROMPT | |
| import PyPDF2 | |
| import pinecone | |
| from langchain.vectorstores import Pinecone | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Pinecone credentials are read from the environment at import time and
# fail fast with KeyError if unset.
# NOTE(review): `os` is not imported here directly — presumably it arrives
# via the wildcard import from utils_db; verify.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']
def load_local_file_PDF(path, file_name):
    """Read a local PDF and split its text into ~1000-character chunks.

    Args:
        path: filesystem path of the PDF to read.
        file_name: original file name; a trailing '.pdf' is stripped to
            build the chunk keys.

    Returns:
        dict mapping '<stem>_<i>' to the i-th text chunk.
    """
    # Derive the key stem up front: the original only assigned `index`
    # inside the `.pdf` check, so any other name raised NameError later.
    index = file_name[:-4] if file_name.endswith('.pdf') else file_name
    full_text = ''
    # Context manager closes the handle (the original leaked the open file).
    with open(path, 'rb') as fh:
        pdf_reader = PyPDF2.PdfReader(fh)
        # Pages must be extracted while the file is still open.
        for page in pdf_reader.pages:
            full_text += page.extract_text()
    full_text = full_text.replace('\n', '').replace('\t', '')
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_text(full_text)
    # enumerate replaces the hand-rolled counter.
    return {f'{index}_{i}': content for i, content in enumerate(texts)}
def holo_query_insert_file_contents(file_name, file_content):
    """Insert one (file_name, content) row into the s_context table.

    NOTE(review): values are interpolated into the SQL text. Single quotes
    are now doubled (standard SQL escaping), so content containing an
    apostrophe no longer breaks the statement. True parameter binding via
    holo_query_func would be the proper fix — TODO confirm the driver
    supports placeholders.
    """
    safe_name = file_name.replace("'", "''")
    safe_content = file_content.replace("'", "''")
    run_sql = f"""
    insert into s_context(
        file_name,
        content
    )
    select
        '{safe_name}' as file_name,
        '{safe_content}' as content
    """
    # is_query=0: statement executes for its side effect, no result set.
    holo_query_func(run_sql, is_query=0)
def holo_query_get_content(run_sql):
    """Execute `run_sql` and return column 1 of every row with all
    newline and tab characters removed."""
    rows = holo_query_func(run_sql, is_query=1)
    return [row[1].replace('\n', '').replace('\t', '') for row in rows]
def pdf2database(path, file_name):
    """Split a local PDF into ~1000-character chunks and insert each chunk
    into the Hologres s_context table, logging every stored key.

    Args:
        path: filesystem path of the PDF to read.
        file_name: original file name; a trailing '.pdf' is stripped to
            build the chunk keys.
    """
    # Same fixes as load_local_file_PDF: `index` is always defined now,
    # and the file handle is closed via the context manager.
    index = file_name[:-4] if file_name.endswith('.pdf') else file_name
    full_text = ''
    with open(path, 'rb') as fh:
        reader = PyPDF2.PdfReader(fh)
        for page in reader.pages:
            full_text += page.extract_text()
    full_text = full_text.replace('\n', '').replace('\t', '')
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    for i, chunk in enumerate(splitter.split_text(full_text)):
        holo_query_insert_file_contents(f'{index}_{i}', chunk)
        logger.info(f'{index}_{i} stored')
def load_json(path):
    """Deserialize the UTF-8 JSON file at `path` and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
def get_content_from_json(path):
    """Load a JSON array of single-pair objects and return a list of
    'key,value' strings (only the first key of each object is used)."""
    entries = load_json(path)
    lines = []
    for entry in entries:
        first_key = list(entry)[0]
        lines.append(first_key + ',' + entry[first_key])
    return lines
def data2embeddings(index_name, data, embeddings):
    """Embed every text in `data` and upsert the vectors into the
    Pinecone index `index_name` using the module-level credentials."""
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    Pinecone.from_texts(list(data), embeddings, index_name=index_name)
    logger.info("Stored Successfully")
def context_construction(api_key, query, model, pinecone_api_key, pinecone_api_env, temperature, index_name, mode="map_reduce"):
    """Fetch the top-2 chunks similar to `query` from a Pinecone index and
    wrap them as auxiliary context for the chat prompt.

    `model`, `temperature` and `mode` are currently unused; they are kept
    so existing call sites keep working.

    Returns:
        (prompt_with_context, context_only, status_message)
    """
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_api_env)
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
    # Guard clause: a query with no alphanumeric character is treated as empty.
    if not any(ch.isalnum() for ch in query):
        return " ", MIGRAINE_PROMPT, "Connecting to Pinecone"
    docs = docsearch.similarity_search(query, include_metadata=True, k=2)
    context = ' '.join(doc.page_content for doc in docs)
    return '用以下资料进行辅助回答\n' + context, '\n' + context, "Connecting to Pinecone"
def chat_prerequisites(input, filter, embeddings, top_k=4):
    """Build the option-classification prompt from the top-k Pinecone matches.

    Args:
        input: raw job-posting description to classify.
        filter: metadata filter dict forwarded to the similarity search.
        embeddings: embedding model used by the Pinecone index.
        top_k: number of candidate categories to retrieve.

    Returns:
        (system_prompt, query) — the labelled options and the user query.
    """
    input_prompt = '接下来我会给你一段"不规范的招聘职位描述",以及4个用(选项一,选项二,选项三,选项四)四个选项表示的规范的职业分类描述。' \
                   '你需要将"不规范的招聘职位描述"归类为”选项一“或“选项二”或“选项三”或“选项四”。' \
                   '你只需要回复”选项一“或“选项二”或“选项三”或“选项四”,不要回复任何别的东西'
    query = input_prompt + input
    # Searches the first available index, same as the original code.
    docsearch = Pinecone.from_existing_index(index_name=pinecone.list_indexes()[0], embedding=embeddings)
    docs = docsearch.similarity_search(query, k=top_k, filter=filter)
    # Label table replaces the original chain of `if index == n` blocks;
    # matches beyond the fourth are ignored, as before.
    labels = ("选项一:", "选项二:", "选项三:", "选项四:")
    pieces = [labels[pos] + doc.page_content + "##"
              for pos, doc in enumerate(docs) if pos < len(labels)]
    system_prompt = ' '.join(pieces)
    return system_prompt, query
def chat(input, filter, embeddings):
    """Classify `input` with gpt-3.5-turbo using Pinecone-derived options.

    Returns:
        (model_reply, system_prompt)
    """
    system_prompt, query = chat_prerequisites(input, filter, embeddings)
    logger.info('prerequisites satisfied')
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message['content'], system_prompt
def chat_data_cleaning(input):
    """Ask gpt-3.5-turbo to distill a raw job posting into the fixed
    '岗位名称: … # 岗位描述: … #' format and return the model's reply."""
    clean_prompt = '我要求你提取出这段文字中的岗位名称、岗位描述(用一句或者两句话概括),去除无关紧要的信息,比如工资,地点等等,并严格遵守"岗位名称: xxx # 岗位描述: xxx # "的格式进行回复'
    # The prompt is sent both as the system message and as a prefix of the
    # user message, exactly as in the original implementation.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": clean_prompt},
            {"role": "user", "content": clean_prompt + input},
        ],
    )
    return response.choices[0].message['content']
def local_emb2pinecone(PINECONE_API_KEY, PINECONE_API_ENV, level, emb_path, text_path, delete=False):
    """Upload locally stored embeddings and their texts into the first
    Pinecone index.

    Args:
        PINECONE_API_KEY / PINECONE_API_ENV: Pinecone credentials.
        level: numeric level tag stored in each vector's metadata.
        emb_path: JSON file mapping vector id -> embedding values.
        text_path: text file, one line of metadata text per embedding,
            paired with the embeddings in insertion order.
        delete: when True, interactively confirm and wipe all existing
            vectors before uploading.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    logger.info('Pinecone initialized')
    # list_indexes() is a remote call — fetch once and reuse (the original
    # called it twice).
    index_name = pinecone.list_indexes()[0]
    logger.info(index_name)
    embeddings = load_json(emb_path)
    # logger instead of print, consistent with the rest of this function.
    logger.info(f'level{level} loaded')
    with open(text_path, 'r', encoding='utf-8') as f:
        texts = [line.replace('\n', '') for line in f.readlines()]
    index = pinecone.Index(index_name)
    # Dead `else: pass` branches removed; behavior is unchanged.
    if delete and input('press y to delete all the vectors: ') == 'y':
        index.delete(delete_all=True)
        logger.info('delete all')
    for (key, value), text in zip(embeddings.items(), texts):
        index.upsert([(key, value, {"text": text, "level": level})])
    logger.info('upload successfully')