SayknowLab committed on
Commit
87409f3
·
verified ·
1 Parent(s): 03b24e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -37
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import pandas as pd
2
  import torch
3
- from flask import Flask, request, Response
4
  from transformers import AutoTokenizer, GPT2LMHeadModel
5
  from dicttoxml import dicttoxml
 
 
6
 
7
  app = Flask(__name__)
8
 
@@ -13,26 +15,22 @@ print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
13
  model = GPT2LMHeadModel.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
14
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
15
 
16
- # 2. ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ (์—‘์…€์˜ ํ•œ ์ปฌ๋Ÿผ์— ์ง€์‹ ๋ฐ์ดํ„ฐ๊ฐ€ ๋ชฐ๋ ค์žˆ๋Š” ๊ฒฝ์šฐ)
17
  try:
18
  df = pd.read_excel('dataset.xlsx')
19
- knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist() # ๋ฐ์ดํ„ฐ์…‹ ๋ฌธ์žฅ ๋ฆฌ์ŠคํŠธํ™”
20
  except Exception as e:
21
  print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
22
  knowledge_list = []
23
 
24
  def find_relevant_context(query, top_n=2):
25
- """๋ฐ์ดํ„ฐ์…‹์—์„œ ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ๋ฌธ์žฅ์„ ์ฐพ์•„ ๋ฐ˜ํ™˜"""
26
- # ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ํ•„ํ„ฐ๋ง - ๊ณต๋ฐฑ ์ œ๊ฑฐํ•ด์„œ ๊ฒ€์ƒ‰
27
  query_words = query.replace(" ", "").lower()
28
  relevant_sentences = []
29
-
30
  for s in knowledge_list:
31
  s_text = str(s).replace(" ", "").replace("\n", "").lower()
32
- # ํ‚ค์›Œ๋“œ๊ฐ€ ๋ฌธ์žฅ์— ํฌํ•จ๋˜์–ด ์žˆ๋Š”์ง€ ํ™•์ธ
33
  if any(word.replace(" ", "") in s_text for word in query.split()):
34
  relevant_sentences.append(s)
35
-
36
  if relevant_sentences:
37
  return " ".join(str(s) for s in relevant_sentences[:top_n])
38
  return ""
@@ -40,18 +38,13 @@ def find_relevant_context(query, top_n=2):
40
  def ask_sayknow(query):
41
  try:
42
  context = find_relevant_context(query)
43
-
44
- # ์ž๊ธฐ์†Œ๊ฐœ/์ธ์‚ฌ ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋ช…์‹œ์  ์•ˆ๋‚ด ์ถ”๊ฐ€
45
- # ์—ญํ• , ์˜ˆ์‹œ, ์ •๋ณด/์งˆ๋ฌธ ๊ตฌ๋ถ„, context ์—†์„ ๋•Œ '์ •๋ณด ์—†์Œ' ๋ช…์‹œ
46
  persona_guide = (
47
- "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์—” '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
48
- "๊ทธ ์™ธ์—๋Š” ์•„๋ž˜ ์ •๋ณด๋ฅผ ์ฐธ๊ณ ํ•ด ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์œผ๋กœ 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
49
  "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
50
  )
51
  info = context if context else "์ •๋ณด ์—†์Œ"
52
- prompt = (
53
- f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "
54
- )
55
 
56
  input_ids = tokenizer.encode(prompt, return_tensors='pt')
57
  model.eval()
@@ -69,30 +62,21 @@ def ask_sayknow(query):
69
  num_beams=1
70
  )
71
  response = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
72
- # ๋‹ต๋ณ€ ๋ถ€๋ถ„๋งŒ ์ž˜๋ผ๋‚ด๊ธฐ
73
- if "๋‹ต๋ณ€:" in response:
74
- answer = response.split("๋‹ต๋ณ€:")[-1].strip()
75
- else:
76
- answer = response.strip()
77
 
78
- # 1. ์˜๋ฏธ ์—†๋Š” ์ˆ˜์‹/์˜๋ฌธ/ํŠน์ˆ˜๋ฌธ์ž/๋ฐ˜๋ณต๋ฌธ์ž ๋“ฑ ํ•„ํ„ฐ๋ง
79
- import re
80
- # ํ•œ๊ธ€, ์ˆซ์ž, ๊ธฐ๋ณธ ๊ตฌ๋‘์ ๋งŒ ํ—ˆ์šฉ
81
  answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
82
- # ๋ฐ˜๋ณต๋˜๋Š” ํŠน์ˆ˜๋ฌธ์ž, ์ˆซ์ž, ์˜๋ฌธ ์ œ๊ฑฐ
83
  answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
84
  answer = re.sub(r"[a-zA-Z]+", "", answer)
85
- # ์ˆ˜์‹(=, ^, *, / ๋“ฑ) ์ œ๊ฑฐ
86
  answer = re.sub(r"[=^*/\\]+", "", answer)
87
- # ์—ฐ์† ๊ณต๋ฐฑ ์ •๋ฆฌ
88
  answer = re.sub(r"\s+", " ", answer).strip()
89
 
90
- # 2. 80์ž ์ด๋‚ด๋กœ ์ž๋ฅด๊ธฐ (ํ•œ๊ธ€ ๊ธฐ์ค€)
91
  def truncate_korean(text, max_len=80):
92
  count = 0
93
  result = ""
94
  for ch in text:
95
- # ํ•œ๊ธ€, ํ•œ์ž, ์˜๋ฌธ, ์ˆซ์ž, ๊ตฌ๋‘์  ๋ชจ๋‘ 1์ž๋กœ ์ทจ๊ธ‰
96
  result += ch
97
  count += 1
98
  if count >= max_len:
@@ -100,23 +84,20 @@ def ask_sayknow(query):
100
  return result
101
  answer = truncate_korean(answer, 80)
102
 
103
- # 3. ๋ฌธ์žฅ ๋์ด ์ž์—ฐ์Šค๋Ÿฝ์ง€ ์•Š์œผ๋ฉด ๋งˆ์นจํ‘œ ์ถ”๊ฐ€
104
  if answer and answer[-1] not in ".!?":
105
  answer += "."
106
  return answer
107
  except Exception as e:
108
  print(f"ask_sayknow ์—๋Ÿฌ: {e}")
109
- import traceback
110
  traceback.print_exc()
111
  return f"์˜ค๋ฅ˜: {str(e)}"
112
 
113
- # 3. REST API ์—”๋“œํฌ์ธํŠธ
114
  @app.route('/chatapi.html', methods=['GET'])
115
  @app.route('/index.html', methods=['GET'])
116
- @app.route('/', methods=['GET'])
117
  def chat_api():
118
  query = request.args.get('askdata', '')
119
-
120
  if not query:
121
  result = {"status": "error", "message": "No data"}
122
  else:
@@ -129,7 +110,6 @@ def chat_api():
129
  }
130
  except Exception as e:
131
  print(f"chat_api ์—๋Ÿฌ: {e}")
132
- import traceback
133
  traceback.print_exc()
134
  result = {
135
  "service": "Sayknow",
@@ -138,9 +118,37 @@ def chat_api():
138
  "error": str(e)
139
  }
140
 
141
- # XML ๋ณ€ํ™˜
142
  xml_output = dicttoxml(result, custom_root='SayknowAPI', attr_type=False)
143
  return Response(xml_output, mimetype='text/xml')
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if __name__ == '__main__':
146
- app.run(host='0.0.0.0', port=7860)
 
1
  import pandas as pd
2
  import torch
3
+ from flask import Flask, request, Response, render_template_string
4
  from transformers import AutoTokenizer, GPT2LMHeadModel
5
  from dicttoxml import dicttoxml
6
+ import re
7
+ import traceback
8
 
9
  app = Flask(__name__)
10
 
 
15
  model = GPT2LMHeadModel.from_pretrained("EleutherAI/polyglot-ko-1.3b", trust_remote_code=True)
16
  print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
17
 
18
+ # 2. 데이터셋 로드 (엑셀 한 컬럼에 지식 데이터 있을 때)
19
  try:
20
  df = pd.read_excel('dataset.xlsx')
21
+ knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist() # ์ปฌ๋Ÿผ๋ช… ์ •ํ™•ํžˆ ๋งž์ถฐ์•ผ ํ•ด!
22
  except Exception as e:
23
  print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
24
  knowledge_list = []
25
 
26
def find_relevant_context(query, top_n=2):
    """Return up to ``top_n`` knowledge entries that mention a word of ``query``.

    Each entry of the module-level ``knowledge_list`` is compared with spaces
    and newlines removed and lower-cased; an entry matches when any
    whitespace-separated word of ``query`` occurs in it as a substring.

    Args:
        query: The user's question text.
        top_n: Maximum number of matching entries to include.

    Returns:
        The matching entries (in list order) joined by single spaces, or an
        empty string when nothing matches.
    """
    # Lower-case the query words as well: the entries are lower-cased before
    # the substring test, so a query such as "Python" could otherwise never
    # match an entry containing "python".
    needles = [word.lower() for word in query.split()]
    if not needles:
        return ""

    matches = []
    for entry in knowledge_list:
        haystack = str(entry).replace(" ", "").replace("\n", "").lower()
        if any(needle in haystack for needle in needles):
            matches.append(entry)
            # Only the first top_n hits are ever returned, so stop scanning
            # once they are collected (for a positive limit).
            if top_n and 0 < top_n <= len(matches):
                break

    return " ".join(str(entry) for entry in matches[:top_n])
 
38
  def ask_sayknow(query):
39
  try:
40
  context = find_relevant_context(query)
 
 
 
41
  persona_guide = (
42
+ "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
43
+ "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
44
  "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
45
  )
46
  info = context if context else "์ •๋ณด ์—†์Œ"
47
+ prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "
 
 
48
 
49
  input_ids = tokenizer.encode(prompt, return_tensors='pt')
50
  model.eval()
 
62
  num_beams=1
63
  )
64
  response = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
65
+ # ๋‹ต๋ณ€๋งŒ ์ถ”์ถœ
66
+ answer = response.split("๋‹ต๋ณ€:")[-1].strip() if "๋‹ต๋ณ€:" in response else response.strip()
 
 
 
67
 
68
+ # ์˜๋ฏธ ์—†๋Š” ๋ฌธ์ž ํ•„ํ„ฐ๋ง
 
 
69
  answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
 
70
  answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
71
  answer = re.sub(r"[a-zA-Z]+", "", answer)
 
72
  answer = re.sub(r"[=^*/\\]+", "", answer)
 
73
  answer = re.sub(r"\s+", " ", answer).strip()
74
 
75
+ # 80์ž ์ด๋‚ด ์ž๋ฅด๊ธฐ
76
  def truncate_korean(text, max_len=80):
77
  count = 0
78
  result = ""
79
  for ch in text:
 
80
  result += ch
81
  count += 1
82
  if count >= max_len:
 
84
  return result
85
  answer = truncate_korean(answer, 80)
86
 
87
+ # ๋ฌธ์žฅ ๋ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ
88
  if answer and answer[-1] not in ".!?":
89
  answer += "."
90
  return answer
91
  except Exception as e:
92
  print(f"ask_sayknow ์—๋Ÿฌ: {e}")
 
93
  traceback.print_exc()
94
  return f"์˜ค๋ฅ˜: {str(e)}"
95
 
96
+ # 3. API (XML 응답 유지)
97
  @app.route('/chatapi.html', methods=['GET'])
98
  @app.route('/index.html', methods=['GET'])
 
99
  def chat_api():
100
  query = request.args.get('askdata', '')
 
101
  if not query:
102
  result = {"status": "error", "message": "No data"}
103
  else:
 
110
  }
111
  except Exception as e:
112
  print(f"chat_api ์—๋Ÿฌ: {e}")
 
113
  traceback.print_exc()
114
  result = {
115
  "service": "Sayknow",
 
118
  "error": str(e)
119
  }
120
 
 
121
  xml_output = dicttoxml(result, custom_root='SayknowAPI', attr_type=False)
122
  return Response(xml_output, mimetype='text/xml')
123
 
124
+ # 4. 웹 UI (간단한 질문 폼 + 답변)
125
# Static Jinja2 template for the landing page. User-controlled values are
# supplied as template variables at render time and are never formatted into
# the template source, so they cannot be interpreted as Jinja2 syntax or HTML.
_INDEX_TEMPLATE = """
<html>
  <head>
    <title>Sayknow 챗봇</title>
  </head>
  <body>
    <h2>Sayknow 한국어 챗봇</h2>
    <form method="post" action="/">
      <input type="text" name="question" value="{{ question }}" placeholder="질문을 입력하세요" style="width:300px;" autofocus />
      <input type="submit" value="질문하기" />
    </form>
    <hr>
    <h3>답변:</h3>
    <p style="white-space: pre-wrap;">{{ answer }}</p>
  </body>
</html>
"""


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the question form and, on POST, the answer to the submitted question.

    GET renders an empty form. POST reads the ``question`` form field, passes
    a non-empty value to ``ask_sayknow`` and renders the form again with the
    question pre-filled and the returned answer below it.

    Returns:
        The rendered HTML page.
    """
    question = ""
    answer = ""
    if request.method == "POST":
        question = request.form.get('question', '')
        if question:
            answer = ask_sayknow(question)

    # Pass the values as context variables instead of building the template
    # with an f-string: text such as "{{ ... }}" or "<script>" typed into the
    # form is then escaped and displayed rather than executed.
    return render_template_string(_INDEX_TEMPLATE, question=question, answer=answer)
152
+
153
  if __name__ == '__main__':
154
+ app.run(host='0.0.0.0', port=7860)