SayknowLab committed on
Commit
d2b1063
·
verified ·
1 Parent(s): 86d27b0

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile.txt +15 -0
  2. app.py +206 -0
  3. requirements.txt +8 -0
Dockerfile.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile (์ด๊ฑฐ ๊ทธ๋Œ€๋กœ ๋ณต์‚ฌํ•ด์„œ Dockerfile ์ด๋ผ๋Š” ์ด๋ฆ„์œผ๋กœ ์ €์žฅ!)
2
+ FROM python:3.9-slim-buster
3
+
4
+ WORKDIR /app
5
+
6
+ COPY requirements.txt .
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ COPY app.py .
10
+ COPY dataset.xlsx .
11
+
12
+ EXPOSE 7860
13
+
14
+ # ๋„ค Flask ์•ฑ์„ ์›น ์„œ๋ฒ„์ฒ˜๋Ÿผ ์‹คํ–‰์‹œ์ผœ์ฃผ๋Š” ๋ช…๋ น์–ด (Gunicorn์ด ๋„์™€์คŒ)
15
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Module imports and Flask app construction for the Sayknow chatbot.
import re
import traceback

import pandas as pd
# BUG FIX: torch and dicttoxml were commented out but are used below
# (torch.no_grad() in ask_sayknow, dicttoxml() in chat_api) — without these
# imports the app raises NameError on the first request.
import torch
from dicttoxml import dicttoxml
from flask import Flask, request, Response, render_template_string
from transformers import AutoTokenizer, GPT2LMHeadModel

app = Flask(__name__)

# --- All hCaptcha configuration code was removed ---
13
+ # 1. ๋ชจ๋ธ ๋กœ๋“œ (๊ธฐ์กด๊ณผ ๋™์ผ)
14
+ print("ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘...")
15
+ tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", trust_remote_code=True)
16
+ print("๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
17
+ model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2", trust_remote_code=True)
18
+ print("๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ!")
19
+
20
+ # 2. ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ (๊ธฐ์กด๊ณผ ๋™์ผ)
21
# 2. Load the knowledge dataset; fall back to an empty knowledge base so the
# app still starts when the spreadsheet is missing or unreadable.
try:
    df = pd.read_excel('dataset.xlsx')
    knowledge_list = df['๋ฐ์ดํ„ฐ์…‹์— ๋„ฃ์„ ๋‚ด์šฉ(*)'].tolist()
except Exception as e:
    print(f"๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์—๋Ÿฌ: {e}")
    knowledge_list = []
27
+
28
def find_relevant_context(query, top_n=2):
    """Return up to *top_n* knowledge sentences related to *query*.

    A sentence matches when any whitespace-separated word of the query occurs
    as a substring of the sentence (sentence spaces/newlines stripped,
    comparison case-insensitive). Matches are joined with spaces; returns ""
    when nothing matches or the knowledge base is empty.
    """
    # BUG FIX: the original lowercased the sentences but not the query words,
    # so matching was not actually case-insensitive; it also computed an
    # unused `query_words` variable and stripped spaces from words that
    # (coming from split()) cannot contain any.
    words = [w.lower() for w in query.split()]
    relevant_sentences = []
    for s in knowledge_list:
        s_text = str(s).replace(" ", "").replace("\n", "").lower()
        if any(w in s_text for w in words):
            relevant_sentences.append(s)
    if relevant_sentences:
        return " ".join(str(s) for s in relevant_sentences[:top_n])
    return ""
39
+
40
def ask_sayknow(query):
    """Generate an answer for *query* with KoGPT2, grounded on the dataset.

    Builds a persona prompt plus any relevant knowledge sentences, samples a
    completion, strips the echoed prompt, then sanitizes and truncates the
    answer to at most 80 characters. On any internal failure, returns an
    error string (this string is shown to the end user).
    """
    try:
        context = find_relevant_context(query)
        persona_guide = (
            "๋„ˆ๋Š” ์ง€์‹ ๊ธฐ๋ฐ˜ ํ•œ๊ตญ์–ด ์ฑ—๋ด‡ Sayknow์•ผ. ์ž๊ธฐ์†Œ๊ฐœ(์ด๋ฆ„, ์ •์ฒด, ์ธ์‚ฌ ๋“ฑ) ์งˆ๋ฌธ์€ '์ €๋Š” Sayknow์ž…๋‹ˆ๋‹ค.'๋ผ๊ณ  ๋‹ตํ•ด. "
            "๊ทธ ์™ธ์—” ์•„๋ž˜ ์ฐธ๊ณ ํ•ด์„œ ์ •ํ™•ํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์œผ๋กœ 80์ž ์ด๋‚ด๋กœ ๋‹ตํ•ด.\n"
            "์˜ˆ์‹œ: Q: ๋ถ„์ˆ˜์˜ ๋ง์…ˆ์ด ๋ญ์•ผ?\nA: ๋ถ„๋ชจ๊ฐ€ ๊ฐ™์„ ๋•Œ ๋ถ„์ž๋ผ๋ฆฌ ๋”ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n"
        )
        info = context if context else "์ •๋ณด ์—†์Œ"
        prompt = f"{persona_guide}---\n[์ •๋ณด]\n{info}\n[์งˆ๋ฌธ]\n{query}\n[๋‹ต๋ณ€] "

        # Encode with an explicit attention mask so generate() behaves
        # deterministically with padding. NOTE(review): mutating the shared
        # tokenizer per call is harmless here but not thread-safe — confirm
        # gunicorn worker model before adding threads.
        tokenizer.pad_token = tokenizer.eos_token
        encoded_input = tokenizer.encode_plus(
            prompt,
            return_tensors='pt',
            truncation=True,
            padding=True
        )
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']

        model.eval()
        with torch.no_grad():
            gen_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=512,  # generous budget so answers are not cut off
                min_length=5,
                repetition_penalty=1.3,
                do_sample=True,
                top_k=30,
                top_p=0.85,
                pad_token_id=tokenizer.pad_token_id,
                temperature=0.5,
                num_beams=1
            )
        raw_response = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

        # 1. Drop the echoed prompt when the model repeats it verbatim,
        # which prevents the instructions from leaking into the answer.
        if raw_response.startswith(prompt):
            extracted_answer = raw_response[len(prompt):].strip()
        else:
            extracted_answer = raw_response.strip()

        # 2. Extract the real answer after the answer tag.
        # BUG FIX: the original searched for "๋‹ต๋ณ€:" (with a colon), but the
        # prompt writes the tag as "[๋‹ต๋ณ€] ", so that branch could never match.
        if "[๋‹ต๋ณ€]" in extracted_answer:
            answer = extracted_answer.split("[๋‹ต๋ณ€]", 1)[1].strip()
        else:
            # No tag present: try to cut off any echoed persona instructions.
            persona_end_marker = "๋‹ตํ•ด.\n"  # the closing characters of persona_guide
            if persona_end_marker in extracted_answer:
                # rindex cannot raise here because of the `in` check above,
                # so the original try/except ValueError was dead code.
                cut = extracted_answer.rindex(persona_end_marker) + len(persona_end_marker)
                answer = extracted_answer[cut:].strip()
            else:
                answer = extracted_answer

        # Substitute a user-facing message when extraction produced nothing.
        if not answer:
            answer = "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ฐพ์„ ์ˆ˜ ์—†๊ฑฐ๋‚˜ ๋‚ด์šฉ์ด ๋ช…ํ™•ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."

        # 3. Strip symbols, Latin letters, and runs of repeated punctuation,
        # then collapse whitespace.
        answer = re.sub(r"[^๊ฐ€-ํžฃ0-9 .,!?~\n]", "", answer)
        answer = re.sub(r"([.,!?~])\1{2,}", r"\1", answer)
        answer = re.sub(r"[a-zA-Z]+", "", answer)
        answer = re.sub(r"[=^*/\\]+", "", answer)
        answer = re.sub(r"\s+", " ", answer).strip()

        # 4. Keep at most 80 characters. Plain slicing is exactly equivalent
        # to the original character-by-character truncation loop.
        answer = answer[:80]

        # 5. Ensure the sentence ends with terminal punctuation; guard the
        # empty string so indexing [-1] cannot raise.
        if answer and answer[-1] not in ".!?":
            answer += "."
        elif not answer:
            answer = "์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค."

        return answer
    except Exception as e:
        print(f"ask_sayknow ์—๋Ÿฌ: {e}")
        traceback.print_exc()
        return f"๋‚ด๋ถ€ ์˜ค๋ฅ˜: {str(e)}"  # message visible to the end user
137
+
138
+ # 3. API (XML ์‘๋‹ต ์œ ์ง€) (๊ธฐ์กด๊ณผ ๋™์ผ)
139
# 3. API endpoint: answers ?askdata=... and responds with XML (legacy paths).
@app.route('/chatapi.html', methods=['GET'])
@app.route('/index.html', methods=['GET'])
def chat_api():
    """Serve the chatbot answer for the `askdata` query parameter as XML."""
    query = request.args.get('askdata', '')
    if not query:
        payload = {"status": "error", "message": "No data"}
    else:
        try:
            payload = {
                "service": "Sayknow",
                "question": query,
                "answer": ask_sayknow(query),
            }
        except Exception as e:
            # ask_sayknow already catches its own errors; this is a last
            # line of defense so the endpoint always returns valid XML.
            print(f"chat_api ์—๋Ÿฌ: {e}")
            traceback.print_exc()
            payload = {
                "service": "Sayknow",
                "question": query,
                "answer": f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}",
                "error": str(e),
            }

    xml_output = dicttoxml(payload, custom_root='SayknowAPI', attr_type=False)
    return Response(xml_output, mimetype='text/xml')
165
+
166
# 4. Web UI: a minimal question form plus the generated answer (hCaptcha removed).
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the chat form; on POST, generate and display an answer."""
    answer = ""
    question = ""

    if request.method == "POST":
        question = request.form.get('question', '')
        if question:  # generate an answer as soon as a question is present
            answer = ask_sayknow(question)

    # SECURITY FIX: the original interpolated user input into the template
    # string with an f-string BEFORE rendering, so a question containing
    # Jinja2 syntax (e.g. "{{ config }}") was executed server-side (SSTI),
    # and raw HTML was reflected (XSS). User data is now passed as template
    # context and explicitly escaped with the |e filter.
    html = """
    <html>
    <head>
    <title>Sayknow ์ฑ—๋ด‡</title>
    <!-- hCaptcha ์Šคํฌ๋ฆฝํŠธ ์ œ๊ฑฐ -->
    </head>
    <body>
    <h2>Sayknow ํ•œ๊ตญ์–ด ์ฑ—๋ด‡</h2>
    <form method="post" action="/">
    <input type="text" name="question" value="{{ question|e }}" placeholder="์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”" style="width:300px;" autofocus />
    <br/><br/>
    <!-- hCaptcha ์œ„์ ฏ ์ œ๊ฑฐ -->
    <!-- ์—๋Ÿฌ ๋ฉ”์‹œ์ง€ ๋ณด์—ฌ์ฃผ๋Š” ๋ถ€๋ถ„ ์ œ๊ฑฐ -->
    <br/>
    <input type="submit" value="์งˆ๋ฌธํ•˜๊ธฐ" />
    </form>
    <hr>
    <h3>๋‹ต๋ณ€:</h3>
    <p style="white-space: pre-wrap;">{{ answer|e }}</p>
    </body>
    </html>
    """
    return render_template_string(html, question=question, answer=answer)
204
+
205
if __name__ == '__main__':
    # Local development entry point; production serves via Gunicorn instead.
    app.run(host='0.0.0.0', port=7860)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ torch
3
+ flask
4
+ transformers==4.35.2
5
+ dicttoxml
6
+ gunicorn
7
+ openpyxl
8
+ huggingface_hub==0.17.3