OpenLab-NLP commited on
Commit
fef685f
·
verified ·
1 Parent(s): 3ad824e

Rename ap.py to app.py

Browse files
Files changed (1) hide show
  1. ap.py → app.py +40 -40
ap.py → app.py RENAMED
@@ -22,17 +22,18 @@ TOKENIZER_PATH = "bpe.model"
22
 
23
  if not os.path.exists(MODEL_PATH):
24
  download_file(
25
- "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/encoder_fit.weights.h5?download=true",
26
  MODEL_PATH
27
  )
28
 
29
  if not os.path.exists(TOKENIZER_PATH):
30
  download_file(
31
- "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/bpe.model?download=true",
32
  TOKENIZER_PATH
33
  )
34
 
35
  MAX_LEN = 384
 
36
  EMBED_DIM = 512
37
  LATENT_DIM = 512
38
  BATCH_SIZE = 768 # global batch size (Keras/TPU가 replica-wise로 나눠서 처리)
@@ -190,48 +191,47 @@ encoder = SentenceEncoder(vocab_size=vocab_size)
190
  encoder(np.zeros((1, MAX_LEN), dtype=np.int32)) # 모델 빌드
191
  encoder.load_weights(MODEL_PATH)
192
 
193
- # ===============================
194
- # 4️⃣ 벡터화 함수
195
- # ===============================
196
- def get_sentence_vector(sentence):
197
- tokens = pad_sentence(encode_sentence(sentence))
198
- vec = encoder(np.array([tokens])).numpy()[0]
199
- return vec / np.linalg.norm(vec)
 
200
 
201
- # ===============================
202
- # 5️⃣ 가장 비슷한 문장 찾기
203
- # ===============================
204
- def find_most_similar(query, s1, s2, s3):
205
- candidates = [s1, s2, s3]
206
- candidate_vectors = np.stack([get_sentence_vector(c) for c in candidates]).astype(np.float32)
207
- query_vector = get_sentence_vector(query)
208
-
209
- sims = candidate_vectors @ query_vector # cosine similarity
210
- top_idx = np.argmax(sims)
211
-
212
- return {
213
- "가장 비슷한 문장": candidates[top_idx],
214
- "유사도": float(sims[top_idx])
215
- }
216
 
217
- # ===============================
218
- # 6️⃣ Gradio UI
219
- # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  with gr.Blocks() as demo:
221
- gr.Markdown("## 🔍 문장 유사도 검색기 (쿼리 1개 + 후보 3개)")
222
- with gr.Row():
223
- query_input = gr.Textbox(label="검색할 문장 (Query)", placeholder="여기에 입력")
224
  with gr.Row():
225
- s1_input = gr.Textbox(label="검색 후보 1")
226
- s2_input = gr.Textbox(label="검색 후보 2")
227
- s3_input = gr.Textbox(label="검색 후보 3")
228
- output = gr.JSON(label="결과")
229
 
230
- search_btn = gr.Button("가장 비슷한 문장 찾기")
231
- search_btn.click(
232
- fn=find_most_similar,
233
- inputs=[query_input, s1_input, s2_input, s3_input],
234
- outputs=output
235
- )
 
 
236
 
237
  demo.launch()
 
22
 
23
# ---------------------------------------------------------------
# Fetch model artifacts from the Hub on first run (skipped if the
# files are already present locally).
# ---------------------------------------------------------------
_ARTIFACTS = (
    ("https://huggingface.co/OpenLab-NLP/openlem2-retrieval-qa/resolve/main/encoder_fit.weights.h5?download=true", MODEL_PATH),
    ("https://huggingface.co/OpenLab-NLP/openlem2-retrieval-qa/resolve/main/bpe.model?download=true", TOKENIZER_PATH),
)
for _url, _dest in _ARTIFACTS:
    if not os.path.exists(_dest):
        download_file(_url, _dest)

# Model / retrieval hyper-parameters.
MAX_LEN = 384     # max token length per sentence (truncate/pad target)
TOP_K = 3         # number of top-ranked documents returned by retrieval
EMBED_DIM = 512
LATENT_DIM = 512
BATCH_SIZE = 768  # global batch size (Keras/TPU splits it per replica)
 
191
  encoder(np.zeros((1, MAX_LEN), dtype=np.int32)) # 모델 빌드
192
  encoder.load_weights(MODEL_PATH)
193
 
194
def tokenize(texts):
    """Encode each text with the SentencePiece model and truncate/pad to MAX_LEN.

    Args:
        texts: iterable of strings.

    Returns:
        np.ndarray of shape (len(texts), MAX_LEN), dtype int32, where short
        sequences are right-padded with ``pad_id``.
    """
    batch = []
    for text in texts:
        piece_ids = sp.encode(text, out_type=int)[:MAX_LEN]
        # Right-pad to fixed length; empty padding when already MAX_LEN long.
        padding = [pad_id] * (MAX_LEN - len(piece_ids))
        batch.append(piece_ids + padding)
    return np.array(batch, dtype=np.int32)
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
def search_and_answer(query, docs_text):
    """Rank user-supplied documents against a query and pick the best one.

    Args:
        query: the search/question string.
        docs_text: newline-separated candidate documents (one per line).

    Returns:
        (top_docs, answer) where top_docs is a list of (document, score)
        pairs for the top-k candidates by dot-product score, and answer is
        the single best-scoring document. Returns an empty list plus a
        prompt message when no non-blank document lines were given.
    """
    candidates = [line.strip() for line in docs_text.split("\n") if line.strip()]
    if not candidates:
        return [], "문서를 한 줄씩 입력하세요."

    # Embed query and candidates with the shared encoder (inference mode).
    query_emb = encoder(tokenize([query]), training=False).numpy()
    doc_embs = encoder(tokenize(candidates), training=False).numpy()

    # Dot-product similarity of the single query row against every document.
    scores = (query_emb @ doc_embs.T)[0]
    k = min(TOP_K, len(candidates))
    ranked = np.argsort(scores)[::-1][:k]  # indices, best score first

    top_docs = [(candidates[i], float(scores[i])) for i in ranked]
    return top_docs, candidates[ranked[0]]
221
+
222
# ---------------------------------------------------------------
# Gradio UI: one query box, a multiline document list, a results
# table plus answer box, wired to search_and_answer via a button.
# ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## OpenLEM2 Retrieval-QA 데모 (사용자 문서 입력 가능)")

    with gr.Row():
        query_input = gr.Textbox(label="질문/쿼리", placeholder="예: 서울 날씨 어때?")

    docs_input = gr.Textbox(label="문서 리스트 (한 줄씩)", placeholder="문서를 한 줄씩 입력하세요.", lines=10)

    with gr.Row():
        top_docs_out = gr.Dataframe(headers=["Document", "Score"], max_rows=TOP_K)
        answer_out = gr.Textbox(label="답변")

    submit_btn = gr.Button("검색/QA 실행")
    submit_btn.click(
        fn=search_and_answer,
        inputs=[query_input, docs_input],
        outputs=[top_docs_out, answer_out],
    )

demo.launch()