SayknowLab commited on
Commit
cf1cbb5
·
verified ·
1 Parent(s): 61f7834

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -206
app.py DELETED
@@ -1,206 +0,0 @@
1
- import pandas as pd
2
- import torch
3
- from flask import Flask, request, Response, render_template_string
4
- from transformers import AutoTokenizer, GPT2LMHeadModel
5
- from dicttoxml import dicttoxml
6
- import re
7
- import traceback
8
-
9
app = Flask(__name__)

# (All hCaptcha configuration has been removed from this app.)

# --- 1. Load tokenizer and model ---------------------------------------
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; acceptable only because the model id is pinned and trusted.
print("ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘...")
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", trust_remote_code=True)
print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2", trust_remote_code=True)
print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")

# --- 2. Load the knowledge dataset -------------------------------------
# A missing or malformed spreadsheet must not prevent startup, so any load
# error falls back to an empty knowledge base.
try:
    df = pd.read_excel('dataset.xlsx')
    knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist()
except Exception as e:
    print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
    knowledge_list = []
-
28
def find_relevant_context(query, top_n=2, knowledge=None):
    """Return up to ``top_n`` knowledge sentences that share a word with *query*.

    Args:
        query: The user's question; split on whitespace into candidate words.
        top_n: Maximum number of matching sentences to join and return.
        knowledge: Optional list of sentences to search.  Defaults to the
            module-level ``knowledge_list`` loaded from the Excel dataset,
            preserving the original call signature.

    Returns:
        The first ``top_n`` matching sentences joined with a single space,
        or ``""`` when nothing matches (including an empty/whitespace query).
    """
    if knowledge is None:
        knowledge = knowledge_list
    # Lowercase the query words once.  The knowledge text is lowercased
    # below as well, so matching is genuinely case-insensitive.  (The
    # original lowered only the knowledge side — its lowercased
    # `query_words` variable was computed but never used — so upper-case
    # query words silently never matched.)
    query_words = [word.lower() for word in query.split()]
    relevant_sentences = []
    for sentence in knowledge:
        # Normalize the knowledge sentence: drop spaces/newlines, lowercase,
        # so "substring" matching ignores spacing differences.
        normalized = str(sentence).replace(" ", "").replace("\n", "").lower()
        if any(word in normalized for word in query_words):
            relevant_sentences.append(sentence)
    if relevant_sentences:
        return " ".join(str(s) for s in relevant_sentences[:top_n])
    return ""
39
-
40
def ask_sayknow(query):
    """Generate a short Korean answer to *query* with the KoGPT-2 model.

    Builds a persona prompt plus any relevant knowledge context, samples a
    completion, then aggressively post-processes it: strips the echoed
    prompt, extracts the answer portion, removes non-Korean noise, truncates
    to 80 characters and ensures terminal punctuation.

    Returns the cleaned answer string; on any exception returns an internal
    error message containing str(e) (NOTE(review): this leaks exception
    details to the end user).
    """
    try:
        context = find_relevant_context(query)
        # Persona/instruction preamble (Korean; text garbled by a past
        # encoding mishap but kept byte-identical — it is runtime data).
        persona_guide = (
            "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
            "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์œผ๋กœ 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
            "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
        )
        # Fall back to a "no information" marker when no context matched.
        info = context if context else "์ •๋ณด ์—†์Œ"
        prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "

        # Reuse EOS as the pad token (KoGPT-2 ships without one) so that
        # encode_plus can pad and generate() gets a valid pad_token_id.
        tokenizer.pad_token = tokenizer.eos_token
        encoded_input = tokenizer.encode_plus(
            prompt,
            return_tensors='pt',
            truncation=True,
            padding=True
        )
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']

        # Inference only: disable dropout and gradient tracking.
        model.eval()
        with torch.no_grad():
            gen_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=512,  # generous cap so answers are not cut off
                min_length=5,
                repetition_penalty=1.3,
                do_sample=True,      # sampled decoding — output is nondeterministic
                top_k=30,
                top_p=0.85,
                pad_token_id=tokenizer.pad_token_id,
                temperature=0.5,
                num_beams=1
            )
        # Decode the full sequence (prompt + completion).
        raw_response = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

        # 1) Strip the echoed prompt when the decoded text starts with it.
        # NOTE(review): tokenize/detokenize round-tripping can alter the
        # prompt text slightly, in which case this prefix check misses and
        # the whole response is kept.
        if raw_response.startswith(prompt):
            extracted_answer = raw_response[len(prompt):].strip()
        else:
            extracted_answer = raw_response.strip()

        # 2) Prefer the text after an explicit answer marker, if present.
        if "๋‹ต๋ณ€:" in extracted_answer:
            answer = extracted_answer.split("๋‹ต๋ณ€:", 1)[1].strip()  # keep only after the first marker
        else:
            # No marker: try to cut everything up to the last occurrence of
            # the persona preamble's closing phrase (prompt leakage cleanup).
            persona_end_marker = "๋‹ตํ•ด.\n"
            if persona_end_marker in extracted_answer:
                try:
                    answer = extracted_answer[extracted_answer.rindex(persona_end_marker) + len(persona_end_marker):].strip()
                except ValueError:
                    # rindex cannot actually raise here (membership was just
                    # checked) — kept as a belt-and-braces fallback.
                    answer = extracted_answer
            else:
                answer = extracted_answer

        # Substitute an apology when extraction produced nothing.
        if not answer:
            answer = "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ฐพ์„ ์ˆ˜ ์—†๊ฑฐ๋‚˜ ๋‚ด์šฉ์ด ๋ช…ํ™•ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."

        # 3) Noise filtering: keep only Hangul/digits/basic punctuation,
        # collapse repeated punctuation, drop Latin letters and operator
        # characters, and normalize whitespace.
        answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
        answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
        answer = re.sub(r"[a-zA-Z]+", "", answer)
        answer = re.sub(r"[=^*/\\]+", "", answer)
        answer = re.sub(r"\s+", " ", answer).strip()

        # 4) Hard character cap (80 chars, counted per Python character).
        def truncate_korean(text, max_len=80):
            # Manual per-character walk; equivalent to text[:max_len].
            count = 0
            result = ""
            for ch in text:
                result += ch
                count += 1
                if count >= max_len:
                    break
            return result
        answer = truncate_korean(answer, 80)

        # 5) Ensure the reply ends with sentence punctuation.
        if answer and answer[-1] not in ".!?":
            answer += "."
        elif not answer:  # still empty after all filtering — last resort
            answer = "์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."

        return answer
    except Exception as e:
        # Broad boundary handler: log with traceback, return a user-visible
        # internal-error string instead of raising into the Flask route.
        print(f"ask_sayknow ์—๋Ÿฌ: {e}")
        traceback.print_exc()
        return f"๋‚ด๋ถ€ ์˜ค๋ฅ˜: {str(e)}"
137
-
138
- # 3. API (XML ์‘๋‹ต ์œ ์ง€) (๊ธฐ์กด๊ณผ ๋™์ผ)
139
@app.route('/chatapi.html', methods=['GET'])
@app.route('/index.html', methods=['GET'])
def chat_api():
    """Serve an XML answer for the `askdata` query-string parameter.

    Responds with a <SayknowAPI> document: an error payload when no
    question was supplied, otherwise the question and generated answer
    (or an error payload if generation raised).
    """
    query = request.args.get('askdata', '')
    if query:
        try:
            payload = {
                "service": "Sayknow",
                "question": query,
                "answer": ask_sayknow(query),
            }
        except Exception as e:
            print(f"chat_api ์—๋Ÿฌ: {e}")
            traceback.print_exc()
            payload = {
                "service": "Sayknow",
                "question": query,
                "answer": f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}",
                "error": str(e),
            }
    else:
        payload = {"status": "error", "message": "No data"}

    body = dicttoxml(payload, custom_root='SayknowAPI', attr_type=False)
    return Response(body, mimetype='text/xml')
165
-
166
- # 4. ์›น UI (๊ฐ„๋‹จํ•œ ์งˆ๋ฌธ ํผ + ๋‹ต๋ณ€) - hCaptcha ์ฝ”๋“œ ์ „๋ถ€ ์ œ๊ฑฐ!
167
@app.route('/', methods=['GET', 'POST'])
def index():
    """Minimal web UI: a question form plus the generated answer.

    Security fix: the original interpolated the raw user question (and the
    model answer) into the template *string* with an f-string before calling
    render_template_string.  That allowed both HTML injection and Jinja
    server-side template injection (e.g. submitting ``{{config}}``).  User
    data is now passed as template variables so Jinja auto-escapes it.
    """
    answer = ""
    question = ""

    if request.method == "POST":
        question = request.form.get('question', '')
        if question:  # generate an answer as soon as a question arrives
            answer = ask_sayknow(question)

    # Static template: user-controlled values appear only as {{ ... }}
    # variables, never spliced into the template text itself.
    html = """
    <html>
    <head>
        <title>Sayknow ์ฑ—๋ด‡</title>
    </head>
    <body>
        <h2>Sayknow ํ•œ๊ตญ์–ด ์ฑ—๋ด‡</h2>
        <form method="post" action="/">
            <input type="text" name="question" value="{{ question }}" placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”" style="width:300px;" autofocus />
            <br/><br/>
            <br/>
            <input type="submit" value="์งˆ๋ฌธํ•˜๊ธฐ" />
        </form>
        <hr>
        <h3>๋‹ต๋ณ€:</h3>
        <p style="white-space: pre-wrap;">{{ answer }}</p>
    </body>
    </html>
    """
    return render_template_string(html, question=question, answer=answer)
204
-
205
# Entry point: bind to all interfaces so the server is reachable from
# outside a container; port 7860 is the Hugging Face Spaces convention.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)