SayknowLab committed on
Commit
be972b4
·
verified ·
1 Parent(s): b95f96d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -18
app.py CHANGED
@@ -8,23 +8,25 @@ import traceback
8
 
9
  app = Flask(__name__)
10
 
11
- # 1. 모델 로드
 
 
12
  print("ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘...")
13
  tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
14
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
15
  model = GPT2LMHeadModel.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
16
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
17
 
18
- # 2. 데이터셋 로드 (엑셀 한 컬럼에 지식 데이터 있을 때)
19
  try:
20
  df = pd.read_excel('dataset.xlsx')
21
- knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist() # ์ปฌ๋Ÿผ๋ช… ์ •ํ™•ํžˆ ๋งž์ถฐ์•ผ ํ•ด!
22
  except Exception as e:
23
  print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
24
  knowledge_list = []
25
 
26
  def find_relevant_context(query, top_n=2):
27
- """์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ง€์‹๋ฐ์ดํ„ฐ ๋ฌธ์žฅ ์ตœ๋Œ€ top_n๊ฐœ ์ฐพ์•„์„œ ๋ฐ˜ํ™˜"""
28
  query_words = query.replace(" ", "").lower()
29
  relevant_sentences = []
30
  for s in knowledge_list:
@@ -40,18 +42,28 @@ def ask_sayknow(query):
40
  context = find_relevant_context(query)
41
  persona_guide = (
42
  "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
43
- "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
44
  "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
45
  )
46
  info = context if context else "์ •๋ณด ์—†์Œ"
47
  prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "
48
 
49
- input_ids = tokenizer.encode(prompt, return_tensors='pt')
 
 
 
 
 
 
 
 
 
50
  model.eval()
51
  with torch.no_grad():
52
  gen_ids = model.generate(
53
  input_ids,
54
- max_new_tokens=1500,
 
55
  min_length=5,
56
  repetition_penalty=1.3,
57
  do_sample=True,
@@ -61,18 +73,44 @@ def ask_sayknow(query):
61
  temperature=0.5,
62
  num_beams=1
63
  )
64
- response = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
65
- # ๋‹ต๋ณ€๋งŒ ์ถ”์ถœ
66
- answer = response.split("๋‹ต๋ณ€:")[-1].strip() if "๋‹ต๋ณ€:" in response else response.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # ์˜๋ฏธ ์—†๋Š” ๋ฌธ์ž ํ•„ํ„ฐ๋ง
 
 
 
 
 
 
69
  answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
70
  answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
71
  answer = re.sub(r"[a-zA-Z]+", "", answer)
72
  answer = re.sub(r"[=^*/\\]+", "", answer)
73
  answer = re.sub(r"\s+", " ", answer).strip()
74
 
75
- # 80์ž ์ด๋‚ด ์ž๋ฅด๊ธฐ
76
  def truncate_korean(text, max_len=80):
77
  count = 0
78
  result = ""
@@ -84,16 +122,19 @@ def ask_sayknow(query):
84
  return result
85
  answer = truncate_korean(answer, 80)
86
 
87
- # ๋ฌธ์žฅ ๋ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ
88
- if answer and answer[-1] not in ".!?":
89
  answer += "."
 
 
 
90
  return answer
91
  except Exception as e:
92
  print(f"ask_sayknow ์—๋Ÿฌ: {e}")
93
  traceback.print_exc()
94
- return f"์˜ค๋ฅ˜: {str(e)}"
95
 
96
- # 3. API (XML 응답 유지)
97
  @app.route('/chatapi.html', methods=['GET'])
98
  @app.route('/index.html', methods=['GET'])
99
  def chat_api():
@@ -121,25 +162,35 @@ def chat_api():
121
  xml_output = dicttoxml(result, custom_root='SayknowAPI', attr_type=False)
122
  return Response(xml_output, mimetype='text/xml')
123
 
124
- # 4. ์›น UI (๊ฐ„๋‹จํ•œ ์งˆ๋ฌธ ํผ + ๋‹ต๋ณ€)
125
  @app.route('/', methods=['GET', 'POST'])
126
  def index():
127
  answer = ""
128
  question = ""
 
 
129
  if request.method == "POST":
130
  question = request.form.get('question', '')
131
- if question:
 
 
 
132
  answer = ask_sayknow(question)
133
 
134
  html = f"""
135
  <html>
136
  <head>
137
  <title>Sayknow ์ฑ—๋ด‡</title>
 
138
  </head>
139
  <body>
140
  <h2>Sayknow ํ•œ๊ตญ์–ด ์ฑ—๋ด‡</h2>
141
  <form method="post" action="/">
142
  <input type="text" name="question" value="{question}" placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”" style="width:300px;" autofocus />
 
 
 
 
143
  <input type="submit" value="์งˆ๋ฌธํ•˜๊ธฐ" />
144
  </form>
145
  <hr>
 
8
 
9
  app = Flask(__name__)
10
 
11
+ # --- hCaptcha 설정 관련 코드 전부 제거됨 ---
12
+
13
+ # 1. 모델 로드 (기존과 동일)
14
  print("ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘...")
15
  tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
16
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
17
  model = GPT2LMHeadModel.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
18
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
19
 
20
+ # 2. 데이터셋 로드 (기존과 동일)
21
  try:
22
  df = pd.read_excel('dataset.xlsx')
23
+ knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist()
24
  except Exception as e:
25
  print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
26
  knowledge_list = []
27
 
28
  def find_relevant_context(query, top_n=2):
29
+ """์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ง€์‹๋ฐ์ดํ„ฐ ๋ฌธ์žฅ ์ตœ๋Œ€ top_n๊ฐœ ์ฐพ์•„์„œ ๋ฐ˜ํ™˜ (๊ธฐ์กด๊ณผ ๋™์ผ)"""
30
  query_words = query.replace(" ", "").lower()
31
  relevant_sentences = []
32
  for s in knowledge_list:
 
42
  context = find_relevant_context(query)
43
  persona_guide = (
44
  "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
45
+ "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์œผ๋กœ 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
46
  "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
47
  )
48
  info = context if context else "์ •๋ณด ์—†์Œ"
49
  prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "
50
 
51
+ # ์ด์ „ ๋‹ต๋ณ€ ๋กœ์ง ๊ฐœ์„  (attention_mask ์ถ”๊ฐ€) - ์ด ๋ถ€๋ถ„์€ ์ž˜ ์ž‘๋™ํ•˜๊ณ  ์žˆ์„ ๊ฑฐ์•ผ!
52
+ encoded_input = tokenizer.encode_plus(
53
+ prompt,
54
+ return_tensors='pt',
55
+ truncation=True,
56
+ padding=True
57
+ )
58
+ input_ids = encoded_input['input_ids']
59
+ attention_mask = encoded_input['attention_mask']
60
+
61
  model.eval()
62
  with torch.no_grad():
63
  gen_ids = model.generate(
64
  input_ids,
65
+ attention_mask=attention_mask,
66
+ max_new_tokens=80, # ๋‹ต๋ณ€์ด ์ž˜๋ฆฌ๋Š” ๋ฌธ์ œ ๋ฐฉ์ง€๋ฅผ ์œ„ํ•ด ์กฐ๊ธˆ ๋Š˜๋ ค๋ดค์–ด! (60 -> 80)
67
  min_length=5,
68
  repetition_penalty=1.3,
69
  do_sample=True,
 
73
  temperature=0.5,
74
  num_beams=1
75
  )
76
+ raw_response = tokenizer.decode(gen_ids[0], skip_special_tokens=True) # ์›๋ณธ ์‘๋‹ต ์ €์žฅ
77
+
78
+ # --- ์‘๋‹ต ์ฒ˜๋ฆฌ ๋กœ์ง ๊ฐœ์„  ๋ฒ„์ „ (index out of range ์—๋Ÿฌ ๋ฐฉ์ง€) ---
79
+ # 1. ๋ชจ๋ธ์ด ์ƒ์„ฑํ•œ ์ „์ฒด ํ…์ŠคํŠธ์—์„œ ํ”„๋กฌํ”„ํŠธ ๋ถ€๋ถ„ ์ž๋ฅด๊ธฐ (๋ฐ˜๋ณต๋˜๋Š” ๋ฌธ์ œ ๋ฐฉ์ง€)
80
+ # prompt๊ฐ€ raw_response์˜ ์‹œ์ž‘ ๋ถ€๋ถ„์— ์žˆ๋‹ค๋ฉด ๊ทธ ๋ถ€๋ถ„์„ ์ž˜๋ผ๋‚ผ๊ฒŒ.
81
+ if raw_response.startswith(prompt):
82
+ extracted_answer = raw_response[len(prompt):].strip()
83
+ else:
84
+ extracted_answer = raw_response.strip()
85
+
86
+ # 2. '๋‹ต๋ณ€:' ํ‚ค์›Œ๋“œ๋ฅผ ๊ธฐ์ค€์œผ๋กœ ์ง„์งœ ๋‹ต๋ณ€ ๋ถ€๋ถ„ ์ถ”์ถœ
87
+ if "๋‹ต๋ณ€:" in extracted_answer:
88
+ answer = extracted_answer.split("๋‹ต๋ณ€:", 1)[1].strip() # ์ฒซ ๋ฒˆ์งธ "๋‹ต๋ณ€:" ์ดํ›„๋งŒ
89
+ else:
90
+ # ๋งŒ์•ฝ "๋‹ต๋ณ€:" ํƒœ๊ทธ๊ฐ€ ์—†์œผ๋ฉด, ํ”„๋กฌํ”„ํŠธ์˜ ์ง€์‹œ์‚ฌํ•ญ ์ค‘๋ณต ๋“ฑ์„ ์ œ๊ฑฐ ์‹œ๋„
91
+ persona_end_marker = "๋‹ตํ•ด.\n" # persona_guide์˜ ํŠน์ • ๋ ๋ถ€๋ถ„์„ ํ‘œ์‹œ
92
+ if persona_end_marker in extracted_answer:
93
+ try:
94
+ answer = extracted_answer[extracted_answer.rindex(persona_end_marker) + len(persona_end_marker):].strip()
95
+ except ValueError:
96
+ answer = extracted_answer # ์•ˆ๋˜๋ฉด ๊ทธ๋ƒฅ ์ „์ฒด ์‚ฌ์šฉ
97
+ else:
98
+ answer = extracted_answer # ๊ทธ๊ฒƒ๋„ ์—†์œผ๋ฉด ๊ทธ๋ƒฅ ์ „์ฒด ์‚ฌ์šฉ
99
 
100
+ # ๊ทธ๋ž˜๋„ ๋‹ต๋ณ€์ด ๋น„์–ด์žˆ์œผ๋ฉด ์˜ค๋ฅ˜ ๋ฉ”์‹œ์ง€๋ฅผ ๋Œ€์ฒด
101
+ if not answer:
102
+ answer = "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ฐพ์„ ์ˆ˜ ์—†๊ฑฐ๋‚˜ ๋‚ด์šฉ์ด ๋ช…ํ™•ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."
103
+
104
+
105
+ # 1. ์˜๋ฏธ ์—†๋Š” ์ˆ˜์‹/์˜๋ฌธ/ํŠน์ˆ˜๋ฌธ์ž/๋ฐ˜๋ณต๋ฌธ์ž ๋“ฑ ํ•„ํ„ฐ๋ง (๊ธฐ์กด๊ณผ ๋™์ผ)
106
+ # ์ด ๋ถ€๋ถ„์„ ๋จผ์ € ํ•œ๋ฒˆ ์ ์šฉํ•ด์„œ answer๊ฐ€ ์—‰๋šฑํ•œ ๋ฌธ์ž์—ด์ด ๋˜๋Š” ๊ฑธ ๋ฐฉ์ง€
107
  answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
108
  answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
109
  answer = re.sub(r"[a-zA-Z]+", "", answer)
110
  answer = re.sub(r"[=^*/\\]+", "", answer)
111
  answer = re.sub(r"\s+", " ", answer).strip()
112
 
113
+ # 2. 80์ž ์ด๋‚ด๋กœ ์ž๋ฅด๊ธฐ (ํ•œ๊ธ€ ๊ธฐ์ค€) (๊ธฐ์กด๊ณผ ๋™์ผ)
114
  def truncate_korean(text, max_len=80):
115
  count = 0
116
  result = ""
 
122
  return result
123
  answer = truncate_korean(answer, 80)
124
 
125
+ # 3. ๋ฌธ์žฅ ๋์ด ์ž์—ฐ์Šค๋Ÿฝ์ง€ ์•Š์œผ๋ฉด ๋งˆ์นจํ‘œ ์ถ”๊ฐ€
126
+ if answer and answer[-1] not in ".!?":
127
  answer += "."
128
+ elif not answer: # ๋นˆ ๋ฌธ์ž์—ด์ธ๋ฐ '.' ์ฐ์œผ๋ฉด ์—๋Ÿฌ๋‚˜๋‹ˆ ํ•œ๋ฒˆ ๋” ์ฒดํฌ
129
+ answer = "์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค." # ์ตœํ›„์˜ ๋ณด๋ฃจ
130
+
131
  return answer
132
  except Exception as e:
133
  print(f"ask_sayknow ์—๋Ÿฌ: {e}")
134
  traceback.print_exc()
135
+ return f"๋‚ด๋ถ€ ์˜ค๋ฅ˜: {str(e)}" # ์™ธ๋ถ€ ์‚ฌ์šฉ์ž์—๊ฒŒ ๋ณด์ด๋Š” ๋ฉ”์‹œ์ง€!
136
 
137
+ # 3. API (XML 응답 유지) (기존과 동일)
138
  @app.route('/chatapi.html', methods=['GET'])
139
  @app.route('/index.html', methods=['GET'])
140
  def chat_api():
 
162
  xml_output = dicttoxml(result, custom_root='SayknowAPI', attr_type=False)
163
  return Response(xml_output, mimetype='text/xml')
164
 
165
+ # 4. ์›น UI (๊ฐ„๋‹จํ•œ ์งˆ๋ฌธ ํผ + ๋‹ต๋ณ€) - hCaptcha ์ฝ”๋“œ ์ „๋ถ€ ์ œ๊ฑฐ!
166
  @app.route('/', methods=['GET', 'POST'])
167
  def index():
168
  answer = ""
169
  question = ""
170
+ # error_message ์ œ๊ฑฐ
171
+
172
  if request.method == "POST":
173
  question = request.form.get('question', '')
174
+ # hcaptcha_response ๊ด€๋ จ ๋กœ์ง ์ œ๊ฑฐ
175
+
176
+ # hCaptcha ๊ฒ€์ฆ ๋กœ์ง ์ œ๊ฑฐ
177
+ if question: # ์งˆ๋ฌธ์ด ์žˆ์œผ๋ฉด ๋ฐ”๋กœ ๋‹ต๋ณ€ ์ƒ์„ฑ!
178
  answer = ask_sayknow(question)
179
 
180
  html = f"""
181
  <html>
182
  <head>
183
  <title>Sayknow ์ฑ—๋ด‡</title>
184
+ <!-- hCaptcha ์Šคํฌ๋ฆฝํŠธ ์ œ๊ฑฐ -->
185
  </head>
186
  <body>
187
  <h2>Sayknow ํ•œ๊ตญ์–ด ์ฑ—๋ด‡</h2>
188
  <form method="post" action="/">
189
  <input type="text" name="question" value="{question}" placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”" style="width:300px;" autofocus />
190
+ <br/><br/>
191
+ <!-- hCaptcha ์œ„์ ฏ ์ œ๊ฑฐ -->
192
+ <!-- ์—๋Ÿฌ ๋ฉ”์‹œ์ง€ ๋ณด์—ฌ์ฃผ๋Š” ๋ถ€๋ถ„ ์ œ๊ฑฐ -->
193
+ <br/>
194
  <input type="submit" value="์งˆ๋ฌธํ•˜๊ธฐ" />
195
  </form>
196
  <hr>