Spaces:
Sleeping
Sleeping
Rename ap.py to app.py
Browse files- ap.py → app.py +40 -40
ap.py → app.py
RENAMED
|
@@ -22,17 +22,18 @@ TOKENIZER_PATH = "bpe.model"
|
|
| 22 |
|
| 23 |
# Fetch any asset that is not already on disk. Each entry pairs the local
# destination with the URL it is downloaded from.
_REMOTE_ASSETS = (
    (
        MODEL_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/encoder_fit.weights.h5?download=true",
    ),
    (
        TOKENIZER_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/bpe.model?download=true",
    ),
)
for _local_path, _remote_url in _REMOTE_ASSETS:
    if os.path.exists(_local_path):
        continue  # already present; do not download again
    download_file(_remote_url, _local_path)

# Model / data hyper-parameters.
MAX_LEN = 384
EMBED_DIM = 512
LATENT_DIM = 512
BATCH_SIZE = 768  # global batch size (Keras/TPU splits it replica-wise)
|
|
@@ -190,48 +191,47 @@ encoder = SentenceEncoder(vocab_size=vocab_size)
|
|
| 190 |
# Build the model (per the original note) by pushing one all-zero id batch of
# width MAX_LEN through it, then load the weights stored at MODEL_PATH.
encoder(np.zeros(shape=(1, MAX_LEN), dtype=np.int32))
encoder.load_weights(MODEL_PATH)
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
| 200 |
|
| 201 |
-
# ===============================
|
| 202 |
-
# 5️⃣ 가장 비슷한 문장 찾기
|
| 203 |
-
# ===============================
|
| 204 |
-
def find_most_similar(query, s1, s2, s3):
    """Return whichever of three candidate sentences scores highest for *query*.

    Every sentence is embedded with ``get_sentence_vector``; a candidate's
    score is the dot product of its vector with the query vector.

    Args:
        query: Sentence to search for.
        s1: First candidate sentence.
        s2: Second candidate sentence.
        s3: Third candidate sentence.

    Returns:
        A dict with the winning candidate under ``"가장 비슷한 문장"`` and its
        score, converted to ``float``, under ``"유사도"``.
    """
    options = [s1, s2, s3]
    embedded_options = [get_sentence_vector(text) for text in options]
    option_matrix = np.stack(embedded_options).astype(np.float32)
    query_vec = get_sentence_vector(query)

    # NOTE(review): this is a plain dot product; it equals cosine similarity
    # only if get_sentence_vector returns unit-length vectors -- confirm.
    scores = option_matrix @ query_vec
    best = np.argmax(scores)

    result = {
        "가장 비슷한 문장": options[best],
        "유사도": float(scores[best]),
    }
    return result
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
with gr.Blocks() as demo:
|
| 221 |
-
gr.Markdown("##
|
| 222 |
-
|
| 223 |
-
query_input = gr.Textbox(label="검색할 문장 (Query)", placeholder="여기에 입력")
|
| 224 |
with gr.Row():
|
| 225 |
-
|
| 226 |
-
s2_input = gr.Textbox(label="검색 후보 2")
|
| 227 |
-
s3_input = gr.Textbox(label="검색 후보 3")
|
| 228 |
-
output = gr.JSON(label="결과")
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
| 236 |
|
| 237 |
demo.launch()
|
|
|
|
| 22 |
|
| 23 |
# Fetch any asset that is not already on disk. Each entry pairs the local
# destination with the URL it is downloaded from.
_REMOTE_ASSETS = (
    (
        MODEL_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem2-retrieval-qa/resolve/main/encoder_fit.weights.h5?download=true",
    ),
    (
        TOKENIZER_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem2-retrieval-qa/resolve/main/bpe.model?download=true",
    ),
)
for _local_path, _remote_url in _REMOTE_ASSETS:
    if os.path.exists(_local_path):
        continue  # already present; do not download again
    download_file(_remote_url, _local_path)

# Model / retrieval hyper-parameters.
MAX_LEN = 384  # fixed token-sequence width that inputs are truncated/padded to
TOP_K = 3  # maximum number of ranked documents returned per query
EMBED_DIM = 512
LATENT_DIM = 512
BATCH_SIZE = 768  # global batch size (Keras/TPU splits it replica-wise)
|
|
|
|
| 191 |
# Build the model (per the original note) by pushing one all-zero id batch of
# width MAX_LEN through it, then load the weights stored at MODEL_PATH.
encoder(np.zeros(shape=(1, MAX_LEN), dtype=np.int32))
encoder.load_weights(MODEL_PATH)
|
| 193 |
|
| 194 |
+
def tokenize(texts):
    """Encode *texts* into a fixed-width matrix of token ids.

    Each text is encoded with the module-level tokenizer ``sp``, cut to at
    most ``MAX_LEN`` ids, and right-padded with ``pad_id`` up to ``MAX_LEN``.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        An ``int32`` NumPy array holding one row of ``MAX_LEN`` ids per text.
    """
    rows = []
    for text in texts:
        pieces = sp.encode(text, out_type=int)[:MAX_LEN]
        shortfall = MAX_LEN - len(pieces)
        if shortfall > 0:
            # Right-pad short sequences so every row has the same width.
            pieces = pieces + [pad_id] * shortfall
        rows.append(pieces)
    return np.array(rows, dtype=np.int32)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
+
def search_and_answer(query, docs_text):
    """Rank newline-separated documents against a query and pick the best one.

    Args:
        query: Free-text question or search query.
        docs_text: Candidate documents, one per line. Surrounding whitespace
            is stripped and blank lines are ignored. ``None`` is treated as
            empty input.

    Returns:
        A ``(top_docs, answer)`` pair. ``top_docs`` is a list of
        ``(document, score)`` tuples for at most ``TOP_K`` documents, highest
        score first; ``answer`` is the highest-scoring document. When there
        is nothing to rank (no documents, or a blank query), ``top_docs`` is
        empty and ``answer`` is a short instruction for the user.
    """
    docs = [line.strip() for line in (docs_text or "").split("\n") if line.strip()]
    if not docs:
        return [], "문서를 한 줄씩 입력하세요."
    # A blank query would be embedded as pure padding and yield a meaningless
    # ranking, so ask for input instead of running the encoder.
    if not (query or "").strip():
        return [], "질문을 입력하세요."

    q_emb = encoder(tokenize([query]), training=False).numpy()
    d_embs = encoder(tokenize(docs), training=False).numpy()

    # One dot-product score per document against the single query row.
    scores = np.dot(q_emb, d_embs.T)[0]

    # Stable sort on the negated scores: best first, and documents with equal
    # scores keep their input order. Slicing clamps to len(docs) on its own.
    ranked = np.argsort(-scores, kind="stable")[:TOP_K]
    top_docs = [(docs[i], float(scores[i])) for i in ranked]

    answer = top_docs[0][0]
    return top_docs, answer
|
| 221 |
+
|
| 222 |
# Gradio front end: a query box, a multi-line document box, and a button that
# runs search_and_answer and shows the ranked documents plus the top answer.
# NOTE(review): the nesting below is reconstructed from a listing that lost
# its indentation; docs_input is placed outside the first Row -- confirm.
with gr.Blocks() as demo:
    gr.Markdown("## OpenLEM2 Retrieval-QA 데모 (사용자 문서 입력 가능)")

    with gr.Row():
        query_input = gr.Textbox(
            label="질문/쿼리",
            placeholder="예: 서울 날씨 어때?",
        )

    docs_input = gr.Textbox(
        label="문서 리스트 (한 줄씩)",
        placeholder="문서를 한 줄씩 입력하세요.",
        lines=10,
    )

    with gr.Row():
        # NOTE(review): `max_rows` may not be accepted by every Gradio
        # release -- confirm against the version this Space pins.
        top_docs_out = gr.Dataframe(
            headers=["Document", "Score"],
            max_rows=TOP_K,
        )
        answer_out = gr.Textbox(label="답변")

    run_btn = gr.Button("검색/QA 실행")
    run_btn.click(
        fn=search_and_answer,
        inputs=[query_input, docs_input],
        outputs=[top_docs_out, answer_out],
    )

demo.launch()
|