kylin0421 committed on
Commit
24cd111
·
0 Parent(s):

Clean start

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ bfg.jar
2
+ ..bfg-report
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Poetic Mirror
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.29.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: 'Poetic Mirror is an interesting app. '
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import time
import csv
from data.to_poem_list import to_poem_list
import os
import gradio as gr
from huggingface_hub import hf_hub_download,login


# Authenticate against the Hugging Face Hub so the dataset downloads below
# succeed. HF_TOKEN is expected as an environment variable (e.g. a Space
# secret); login(token=None) falls back to any cached credentials.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)
#====Settings====
# Sentence-encoder fine-tuned to map modern Chinese and classical poetry
# into a shared embedding space.
model_path = "slxhere/modern_ancientpoem_encoder"
# CSV corpus of Tang poems, downloaded from the Hub dataset repo.
poem_csv_path = hf_hub_download(
    repo_id="slxhere/tang_poems",
    repo_type="dataset",
    filename="tang_poem.csv"
)
# DeepSeek is accessed through the OpenAI-compatible client below.
api_key = os.environ.get("DEEPSEEK_API_KEY")
base_url = "https://api.deepseek.com"
# Number of retrieval candidates shown / passed to the LLM reranker.
top_k = 5
# Pre-computed corpus embeddings (numpy .npy) cached in a dataset repo so
# the Space does not have to re-encode the whole corpus on startup.
embedding_cache_path = hf_hub_download(
    repo_id="slxhere/poetic-mirror-cache-tang-embedding",
    repo_type="dataset",
    filename="cached_tang_embedding.npy"
)


print("Loading model and data...")
model = SentenceTransformer(model_path)
client = OpenAI(api_key=api_key, base_url=base_url)
# Flat list of single poem sentences extracted from the CSV; row order here
# must match the row order of the cached embedding matrix.
poem_sentences = to_poem_list(poem_csv_path)

#========

# Load the cached embeddings when present; otherwise encode the corpus once
# (normalized, so cosine similarity reduces to a dot product) and save it.
# NOTE(review): hf_hub_download always returns an existing cache-file path,
# so the else-branch is effectively dead unless the download fails — confirm.
if os.path.exists(embedding_cache_path):
    poem_embeddings = np.load(embedding_cache_path)
else:
    print("Cached embeddings not found! Encoding... This might take some time...")
    poem_embeddings = model.encode(
        poem_sentences, batch_size=64, show_progress_bar=True, normalize_embeddings=True
    )
    np.save(embedding_cache_path, poem_embeddings)
    print(f"Embedding saved to {embedding_cache_path}")
def rerank_with_llm(modern, candidates):
    """Ask the LLM which candidate poem line best matches the input.

    Parameters
    ----------
    modern : str
        The user's modern-Chinese sentence.
    candidates : list[str]
        Candidate classical poem lines, ordered best-first by retrieval.

    Returns
    -------
    int
        Index into *candidates* of the LLM's pick. Falls back to 0 (the
        top retrieval hit) when the API call fails or the reply contains
        no usable number.
    """
    import re  # function-scope import: only needed for reply parsing

    prompt = f"""
我说了一句话:“{modern}”,你觉得下面哪一句古诗最能表达这句话的情绪与意境?

"""
    for i, c in enumerate(candidates):
        prompt += f"{i+1}. {c}\n"
    prompt += "\n请直接回复最匹配的一句编号(如 2),不要解释。"

    try:
        resp = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "你是古诗匹配专家。"},
                {"role": "user", "content": prompt}
            ]
        )
        reply = resp.choices[0].message.content.strip()
        # The model is instructed to answer with a bare number, but replies
        # like "2." or "编号:2" would previously fail the digits-only line
        # check and silently fall back to 0. Extract the first integer
        # anywhere in the reply instead.
        match = re.search(r"\d+", reply)
        if match:
            idx = int(match.group()) - 1
            if 0 <= idx < len(candidates):
                return idx
    except Exception as e:
        # Best-effort reranking: any API failure degrades gracefully to the
        # retrieval ordering rather than crashing the request.
        print("LLM error: ", e)
    return 0
def retrieve_and_rerank(modern_sentence):
    """Retrieve the most similar poem lines and rerank them with the LLM.

    Parameters
    ----------
    modern_sentence : str
        Modern-Chinese input sentence.

    Returns
    -------
    list[dict]
        One dict per candidate with keys "poem", "score" (softmax of the
        cosine similarities, rounded to 4 decimals) and "(LLM selected)"
        (True for the single line picked by the reranker).
    """
    start_time = time.time()
    emb = model.encode([modern_sentence], normalize_embeddings=True)
    sims = cosine_similarity(emb, poem_embeddings)[0]

    # Indices of the k best similarities, best first. Cap k at the corpus
    # size: with fewer than top_k sentences the old range(top_k) loop below
    # would index past the end of top_k_poems.
    k = min(top_k, len(poem_sentences))
    top_k_idx = sims.argsort()[-k:][::-1]
    top_k_sims = sims[top_k_idx]
    top_k_poems = [poem_sentences[i] for i in top_k_idx]

    rerank_idx = rerank_with_llm(modern_sentence, top_k_poems)

    # Softmax over the similarities (max-subtracted for numerical stability)
    # so the displayed scores sum to 1.
    scores = np.exp(top_k_sims - np.max(top_k_sims))
    probs = scores / scores.sum()

    results = [{
        "poem": top_k_poems[i],
        "score": round(float(probs[i]), 4),
        "(LLM selected)": i == rerank_idx
    } for i in range(k)]

    print(f"Reaction time: {time.time() - start_time:.2f}s")
    return results
def poetry_matcher(input_text):
    """Gradio handler: format the reranked matches, one poem per line.

    The line the LLM selected is prefixed with a check mark; every line
    shows the softmax score followed by the poem sentence.
    """
    lines = []
    for entry in retrieve_and_rerank(input_text):
        marker = '✅' if entry['(LLM selected)'] else ' '
        lines.append(f"{marker} [{entry['score']}] {entry['poem']}")
    return "\n".join(lines)
# Minimal Gradio UI: one textbox in, the formatted match list out.
iface = gr.Interface(
    fn=poetry_matcher,
    inputs=gr.Textbox(lines=2, placeholder="Enter your sentence..."),
    outputs="text",
    title="🔭 Poetic Mirror 🖌",
    description="穿越千年诗意,为你精准匹配最契合的古诗名句——输入你的句子,邂逅古人共鸣。\nTravel through a thousand years of poetry—enter your sentence, and we'll find the most matching Tang dynasty verse for you."
)

# Blocking call: serves the app (Spaces picks up the default host/port).
iface.launch()
data/__pycache__/to_poem_list.cpython-310.pyc ADDED
Binary file (804 Bytes). View file
 
data/to_poem_list.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
+
def to_poem_list(file_path, debug=False):
    """Extract single poem sentences from a CSV corpus of poems.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 CSV file with a "内容" (content) column.
    debug : bool, optional
        When True, print extraction statistics and a few samples.

    Returns
    -------
    list[str]
        Sentences of at least 9 characters, each ending with "。".
    """
    poems = []

    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Remove ASCII spaces (full-width spaces are NOT removed —
            # presumably absent from the data; confirm if corpus changes).
            content = row["内容"].strip().replace(" ", "")
            if not content:
                continue
            # Split on the Chinese full stop.
            for part in content.split("。"):
                part = part.strip()
                # Keep only full verses: >= 9 chars is at least a single
                # seven-character line plus caesura mark, filtering out
                # fragments and mojibake.
                if len(part) >= 9:
                    poems.append(part + "。")  # restore the full stop

    if debug:
        print(f"共提取单句:{len(poems)} 条")
        print("示例:", poems[:10])
        # Guard: poems[0] raised IndexError when nothing was extracted.
        if poems:
            print(len(poems[0]))

    return poems
if __name__=="__main__":
    # Ad-hoc smoke test with a hard-coded local path — only works where a
    # Poetry/ data dump sits next to this script; not used by the Space.
    to_poem_list("Poetry/宋_1.csv",True)
requirements.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.7.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.11.18
5
+ aiolimiter==1.2.1
6
+ aiosignal==1.3.2
7
+ annotated-types==0.7.0
8
+ anyio==4.9.0
9
+ async-timeout==5.0.1
10
+ attrs==25.3.0
11
+ certifi==2025.4.26
12
+ charset-normalizer==3.4.2
13
+ click==8.1.8
14
+ datasets==3.6.0
15
+ dill==0.3.8
16
+ distro==1.9.0
17
+ exceptiongroup==1.3.0
18
+ fastapi==0.115.12
19
+ ffmpy==0.5.0
20
+ filelock==3.18.0
21
+ frozenlist==1.6.0
22
+ fsspec==2025.3.0
23
+ gradio==5.29.1
24
+ gradio_client==1.10.1
25
+ groovy==0.1.2
26
+ h11==0.16.0
27
+ httpcore==1.0.9
28
+ httpx==0.28.1
29
+ huggingface-hub==0.31.2
30
+ idna==3.10
31
+ Jinja2==3.1.6
32
+ jiter==0.9.0
33
+ joblib==1.5.0
34
+ markdown-it-py==3.0.0
35
+ MarkupSafe==3.0.2
36
+ mdurl==0.1.2
37
+ mpmath==1.3.0
38
+ multidict==6.4.3
39
+ multiprocess==0.70.16
40
+ networkx==3.4.2
41
+ numpy==2.2.5
42
+ nvidia-cublas-cu12==12.6.4.1
43
+ nvidia-cuda-cupti-cu12==12.6.80
44
+ nvidia-cuda-nvrtc-cu12==12.6.77
45
+ nvidia-cuda-runtime-cu12==12.6.77
46
+ nvidia-cudnn-cu12==9.5.1.17
47
+ nvidia-cufft-cu12==11.3.0.4
48
+ nvidia-cufile-cu12==1.11.1.6
49
+ nvidia-curand-cu12==10.3.7.77
50
+ nvidia-cusolver-cu12==11.7.1.2
51
+ nvidia-cusparse-cu12==12.5.4.2
52
+ nvidia-cusparselt-cu12==0.6.3
53
+ nvidia-nccl-cu12==2.26.2
54
+ nvidia-nvjitlink-cu12==12.6.85
55
+ nvidia-nvtx-cu12==12.6.77
56
+ openai==1.78.1
57
+ orjson==3.10.18
58
+ packaging==25.0
59
+ pandas==2.2.3
60
+ pillow==11.2.1
61
+ propcache==0.3.1
62
+ psutil==7.0.0
63
+ pyarrow==20.0.0
64
+ pydantic==2.11.4
65
+ pydantic_core==2.33.2
66
+ pydub==0.25.1
67
+ Pygments==2.19.1
68
+ python-dateutil==2.9.0.post0
69
+ python-multipart==0.0.20
70
+ pytz==2025.2
71
+ PyYAML==6.0.2
72
+ regex==2024.11.6
73
+ requests==2.32.3
74
+ rich==14.0.0
75
+ ruff==0.11.10
76
+ safehttpx==0.1.6
77
+ safetensors==0.5.3
78
+ scikit-learn==1.6.1
79
+ scipy==1.15.3
80
+ semantic-version==2.10.0
81
+ sentence-transformers==4.1.0
82
+ shellingham==1.5.4
83
+ six==1.17.0
84
+ sniffio==1.3.1
85
+ starlette==0.46.2
86
+ sympy==1.14.0
87
+ tenacity==9.1.2
88
+ threadpoolctl==3.6.0
89
+ tokenizers==0.21.1
90
+ tomlkit==0.13.2
91
+ torch==2.7.0
92
+ tqdm==4.67.1
93
+ transformers==4.51.3
94
+ triton==3.3.0
95
+ typer==0.15.4
96
+ typing-inspection==0.4.0
97
+ typing_extensions==4.13.2
98
+ tzdata==2025.2
99
+ urllib3==2.4.0
100
+ uvicorn==0.34.2
101
+ websockets==15.0.1
102
+ xxhash==3.5.0
103
+ yarl==1.20.0