Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| # import llama_cpp | |
| import base64 | |
| from Crypto.Cipher import AES | |
| from Crypto.Util.Padding import unpad | |
def decrypt_file(input_path: str, key: str) -> str:
    """Decrypt an AES-256-CBC encrypted, base64-encoded file.

    Expected file layout: base64( IV (16 bytes) || ciphertext ), with
    PKCS#7 padding inside the ciphertext.

    Args:
        input_path: Path to the base64-encoded encrypted file.
        key: Passphrase; UTF-8 encoded, then right-padded with b"0" /
            truncated to exactly 32 bytes (AES-256).

    Returns:
        The decrypted plaintext decoded as UTF-8.

    Raises:
        ValueError: If padding is invalid (wrong key or corrupt data).
    """
    # Read and un-base64 the whole encrypted payload.
    with open(input_path, "rb") as f:
        encrypted_data = base64.b64decode(f.read())
    # Pad/truncate in *bytes*, not characters: slicing to 32 characters
    # before encoding could yield more than 32 bytes for a non-ASCII
    # passphrase, which AES.new() would reject.
    key_bytes = key.encode("utf-8").ljust(32, b"0")[:32]
    iv = encrypted_data[:16]
    ciphertext = encrypted_data[16:]
    cipher = AES.new(key_bytes, AES.MODE_CBC, iv)
    plaintext = unpad(cipher.decrypt(ciphertext), AES.block_size)
    return plaintext.decode("utf-8")
| # llm = llama_cpp.Llama.from_pretrained( | |
| # repo_id="mradermacher/bge-large-zh-v1.5-GGUF", | |
| # filename="bge-large-zh-v1.5.Q4_K_M.gguf", | |
| # embedding=True, | |
| # ) | |
| # embedding_1 = llm.create_embedding("Hello, world!") | |
| # embedding_2 = llm.create_embedding("你好, 世界!") # type(embedding_1['data'][0]['embedding']) list | |
from openai import OpenAI
import os
import json

# OpenAI-compatible client pointed at Alibaba DashScope (Bailian).
client_oai = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # DashScope base_url
)

from pymilvus import MilvusClient

# Local Milvus Lite database; dimension 1024 matches text-embedding-v3.
client = MilvusClient("./books.db")
client.create_collection(collection_name="collection_1", dimension=1024)

# Fail fast with a clear message when the AES key env var is absent,
# instead of an opaque AttributeError inside decrypt_file().
aeskey = os.getenv("aeskey")
if aeskey is None:
    raise RuntimeError("environment variable 'aeskey' is not set")
decrypted_content = decrypt_file("encrypted.txt", aeskey)
raw_jsons = json.loads(decrypted_content)

# Pre-computed embeddings, grouped per chapter in the same order as
# the decrypted corpus.
with open("embeddings.json", mode="r", encoding="utf-8") as embedding_file:
    all_embs = json.load(embedding_file)
# Ingest chapter by chapter. raw_jsons is a list of chapters, each of the
# form [chapter_title, entry, entry, ...]; all_embs holds one embedding
# vector per entry, grouped per chapter in the same order.
for chapter_idx, chapter_data in enumerate(raw_jsons):
    chapter = chapter_data[0]
    metas = []
    for entry in chapter_data[1:]:
        content = entry["原文"]
        metas.append(
            {
                "index": entry["index"],
                "text": content,
                "annotation": entry.get("注释", ""),
                "critique": entry.get("批判", ""),
                "chapter": chapter,
            }
        )
    # One chapter per batch; embeddings were produced in entry order.
    embeddings = all_embs[chapter_idx]  # List[List[float]]
    print(len(embeddings))
    # A count mismatch would otherwise drop entries silently (or raise
    # IndexError mid-insert) — surface it explicitly.
    if len(embeddings) != len(metas):
        raise ValueError(
            f"chapter {chapter_idx}: {len(embeddings)} embeddings "
            f"for {len(metas)} entries"
        )
    milvus_data = []
    for i, (emb, item) in enumerate(zip(embeddings, metas)):
        milvus_data.append(
            {
                # NOTE(review): id scheme assumes < 100 entries per
                # chapter; ids would collide otherwise — verify corpus.
                "id": chapter_idx * 100 + i,
                "index": item["index"],
                "vector": emb,
                "text": item["text"],
                "annotation": item["annotation"],
                "critique": item["critique"],
                "chapter": item["chapter"],
            }
        )
    print(f"✅ 共 {len(milvus_data)} 条数据")
    client.insert(collection_name="collection_1", data=milvus_data)
    print(f"✅ 插入完成:共 {len(milvus_data)} 条数据")
def greet(name):
    """Vector-search the critique collection for an Analects passage.

    Embeds the input with DashScope's text-embedding-v3 model (1024
    dimensions, matching the collection) and runs a similarity search
    against Milvus, so the query may be fuzzy or partial.

    Args:
        name (str): A line (possibly partial/fuzzy) from the Analects.

    Returns:
        The raw Milvus search result: for the single query vector, a
        list of up to 5 hits, each carrying the output fields "index",
        "text", "annotation" and "critique".
    """
    completion = client_oai.embeddings.create(
        model="text-embedding-v3",
        input=name,
        dimensions=1024,  # only text-embedding-v3 supports `dimensions`
        encoding_format="float",
    )
    res = client.search(
        collection_name="collection_1",
        data=[completion.data[0].embedding],
        limit=5,
        output_fields=["index", "text", "annotation", "critique"],
    )
    return res
# Gradio UI: one textbox in, JSON search results out.
# launch(mcp_server=True) additionally exposes greet() as an MCP tool.
interface_config = dict(
    fn=greet,
    inputs=gr.Textbox(label="输入部分原文句子"),
    outputs=gr.JSON(label="查询结果"),
    title="论语批判MCP (Embedding版本)",
    description="输入模糊的论语原文,可以向量检索到对应的批判内容。",
)
demo = gr.Interface(**interface_config)
demo.launch(mcp_server=True)