Sayknow_v1 / app.py
SayknowLab's picture
Update app.py
5d78baa verified
import pandas as pd
import torch
from flask import Flask, request, Response, render_template_string
from transformers import AutoTokenizer, GPT2LMHeadModel
from dicttoxml import dicttoxml
import re
import traceback
app = Flask(__name__)

# Hugging Face checkpoint used for both the tokenizer and the language model.
_MODEL_NAME = "skt/kogpt2-base-v2"

# 1. Load the tokenizer and the model once, at import time, so every request
#    reuses the same instances.
print("ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘...")
tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME, trust_remote_code=True)
print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
model = GPT2LMHeadModel.from_pretrained(_MODEL_NAME, trust_remote_code=True)
print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")

# 2. Load the knowledge base. Any failure (missing file, missing column, ...)
#    is reported on stdout and leaves the list empty, so the app still starts.
knowledge_list = []
try:
    df = pd.read_excel('dataset.xlsx')
    knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist()
except Exception as exc:
    print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {exc}")
def find_relevant_context(query, top_n=2):
    """Return knowledge-base entries that mention at least one word of ``query``.

    Each entry of the module-level ``knowledge_list`` is converted to ``str``,
    stripped of spaces and newlines, and lower-cased. An entry matches when any
    whitespace-separated word of ``query`` (also lower-cased) occurs in it as a
    substring.

    Args:
        query: The user's question.
        top_n: Maximum number of matching entries to include, in
            ``knowledge_list`` order.

    Returns:
        The first ``top_n`` matching entries joined by a single space, or an
        empty string when nothing matches.
    """
    # Entries are compared in lower case, so the query words must be lower-cased
    # too; otherwise a query containing upper-case letters can never match.
    terms = [word.lower() for word in query.split()]
    if not terms:
        return ""

    matches = []
    for entry in knowledge_list:
        haystack = str(entry).replace(" ", "").replace("\n", "").lower()
        if any(term in haystack for term in terms):
            matches.append(entry)

    return " ".join(str(entry) for entry in matches[:top_n])
def _postprocess_answer(generated, max_len=80):
    """Reduce raw generated text to the short, cleaned answer returned to users.

    Steps: keep the text after the first ``๋‹ต๋ณ€:`` marker (or, failing that,
    after the last echo of the persona guide's closing words), substitute a
    fallback sentence when nothing is left, filter characters with a whitelist
    regex, collapse repeated punctuation and whitespace, cut to ``max_len``
    characters, and make sure the result ends with ``.``, ``!`` or ``?``.
    """
    text = generated.strip()
    if "๋‹ต๋ณ€:" in text:
        text = text.split("๋‹ต๋ณ€:", 1)[1].strip()
    else:
        # rpartition returns the whole text as the tail when the marker is
        # absent, so no membership test or exception handling is needed.
        text = text.rpartition("๋‹ตํ•ด.\n")[2].strip()

    if not text:
        text = "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ฐพ์„ ์ˆ˜ ์—†๊ฑฐ๋‚˜ ๋‚ด์šฉ์ด ๋ช…ํ™•ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."

    # Whitelist filter: everything outside the listed characters is removed.
    # ASCII letters and the symbols = ^ * / \ are not in the whitelist, so no
    # separate passes are needed to strip them.
    # NOTE(review): the character class looks mis-encoded in this copy of the
    # file; confirm it still spells the intended character range.
    text = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", text)
    # Three or more identical punctuation marks in a row collapse to one.
    text = re.sub(r"([.,!?~])\1{2,}", r"\1", text)
    text = re.sub(r"\s+", " ", text).strip()

    text = text[:max_len]
    if not text:
        # Filtering removed everything; return the last-resort message.
        return "์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."
    if text[-1] not in ".!?":
        text += "."
    return text


def ask_sayknow(query):
    """Generate an answer to ``query`` with the module-level model.

    Builds a prompt from a fixed persona guide, the context returned by
    ``find_relevant_context`` (or a placeholder when there is none) and the
    question, samples a continuation from ``model``, and post-processes it
    into a cleaned string of at most 80 characters plus closing punctuation.

    Args:
        query: The user's question.

    Returns:
        The cleaned answer. If anything raises, the traceback is printed and a
        string containing the exception text is returned instead; this
        function does not propagate ``Exception``.
    """
    try:
        context = find_relevant_context(query)
        persona_guide = (
            "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
            "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์œผ๋กœ 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
            "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
        )
        info = context if context else "์ •๋ณด ์—†์Œ"
        prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "

        # The end-of-sequence token doubles as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoded_input = tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']

        model.eval()
        with torch.no_grad():
            gen_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,
                # NOTE(review): the answer is cut to 80 characters afterwards,
                # so 512 new tokens looks larger than needed - confirm.
                max_new_tokens=512,
                min_length=5,
                repetition_penalty=1.3,
                do_sample=True,
                top_k=30,
                top_p=0.85,
                pad_token_id=tokenizer.pad_token_id,
                temperature=0.5,
                num_beams=1,
            )

        # The generated sequence starts with the prompt tokens. Slicing them
        # off by length is reliable, whereas comparing the decoded text with
        # the prompt string breaks whenever decoding does not reproduce the
        # prompt exactly (or the prompt was truncated), leaking it into the
        # answer.
        prompt_len = input_ids.shape[-1]
        generated = tokenizer.decode(gen_ids[0][prompt_len:], skip_special_tokens=True)

        return _postprocess_answer(generated)
    except Exception as e:
        print(f"ask_sayknow ์—๋Ÿฌ: {e}")
        traceback.print_exc()
        # NOTE(review): this string reaches end users and exposes internal
        # exception text; consider a generic message once clients allow it.
        return f"๋‚ด๋ถ€ ์˜ค๋ฅ˜: {str(e)}"
# 3. HTTP API: answers are returned as an XML document.
@app.route('/chatapi.html', methods=['GET'])
@app.route('/index.html', methods=['GET'])
def chat_api():
    """Answer the ``askdata`` query parameter and return the result as XML.

    An empty or missing ``askdata`` yields an error payload. Otherwise the
    payload carries the service name, the question and the answer from
    ``ask_sayknow``; if that call raises, the exception text is placed in both
    the ``answer`` and ``error`` fields. The payload is serialised with
    ``dicttoxml`` under the root element ``SayknowAPI``.
    """
    query = request.args.get('askdata', '')

    if query:
        # Keys are inserted in the order they should appear in the XML.
        result = {"service": "Sayknow", "question": query}
        try:
            result["answer"] = ask_sayknow(query)
        except Exception as exc:
            print(f"chat_api ์—๋Ÿฌ: {exc}")
            traceback.print_exc()
            result["answer"] = f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(exc)}"
            result["error"] = str(exc)
    else:
        result = {"status": "error", "message": "No data"}

    body = dicttoxml(result, custom_root='SayknowAPI', attr_type=False)
    return Response(body, mimetype='text/xml')
# 4. Web UI: a minimal question form followed by the answer.
#
# The page is a static Jinja2 template and user-controlled values are passed
# as context variables. Interpolating them into the template source would let
# a submitted question inject Jinja expressions or HTML into the page.
_INDEX_TEMPLATE = """
<html>
<head>
<title>Sayknow ์ฑ—๋ด‡</title>
<!-- hCaptcha ์Šคํฌ๋ฆฝํŠธ ์ œ๊ฑฐ -->
</head>
<body>
<h2>Sayknow ํ•œ๊ตญ์–ด ์ฑ—๋ด‡</h2>
<form method="post" action="/">
<input type="text" name="question" value="{{ question }}" placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”" style="width:300px;" autofocus />
<br/><br/>
<!-- hCaptcha ์œ„์ ฏ ์ œ๊ฑฐ -->
<!-- ์—๋Ÿฌ ๋ฉ”์‹œ์ง€ ๋ณด์—ฌ์ฃผ๋Š” ๋ถ€๋ถ„ ์ œ๊ฑฐ -->
<br/>
<input type="submit" value="์งˆ๋ฌธํ•˜๊ธฐ" />
</form>
<hr>
<h3>๋‹ต๋ณ€:</h3>
<p style="white-space: pre-wrap;">{{ answer }}</p>
</body>
</html>
"""


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the question form and, after a POST, the generated answer.

    On GET both fields are empty. On POST the ``question`` form field is read
    and, when it is non-empty, answered with ``ask_sayknow``. The question and
    the answer are handed to the template as variables, so they are
    HTML-escaped on output and never parsed as template syntax.
    """
    answer = ""
    question = ""
    if request.method == "POST":
        question = request.form.get('question', '')
        if question:
            answer = ask_sayknow(question)
    return render_template_string(_INDEX_TEMPLATE, question=question, answer=answer)
# Script entry point: when the file is executed directly, start Flask's
# built-in server bound to all network interfaces on port 7860.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)