dgmos committed on
Commit
2f674ad
·
1 Parent(s): 0950d65

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +56 -27
app.py CHANGED
@@ -1,7 +1,7 @@
1
 
2
  import os
3
  import pdfplumber
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
7
  from langchain_community.vectorstores import FAISS
@@ -10,10 +10,9 @@ import gradio as gr
10
 
11
  # 1. 환경 변수 확인
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
- raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. "
14
- "HF Space โ†’ Settings โ†’ Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
15
 
16
- # 2. LLM 모델 (Hugging Face Inference API)
17
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
18
  llm = HuggingFaceEndpoint(
19
  repo_id=repo_id,
@@ -22,12 +21,49 @@ llm = HuggingFaceEndpoint(
22
  task="text-generation"
23
  )
24
 
25
- # 3. 데이터셋에서 PDF 파일 리스트 자동 수집
26
  dataset_repo = "dgmos/ericsson-manuals"
27
- all_files = list_repo_files(dataset_repo)
28
- pdf_files = [f for f in all_files if f.lower().endswith(".pdf")]
29
-
30
- print(f"๐Ÿ“‚ ์ด {len(pdf_files)} ๊ฐœ PDF ๊ฐ์ง€๋จ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # 4. PDF → 텍스트 추출
33
  docs = []
@@ -44,26 +80,24 @@ for filename in pdf_files:
44
  ".join(texts).strip()
45
  if text:
46
  docs.append({"page_content": text, "metadata": {"source": filename}})
47
- print(f"โœ… {filename} ์ฒ˜๋ฆฌ ์™„๋ฃŒ (๊ธธ์ด: {len(text)}์ž)")
48
  else:
49
  print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {filename}")
50
  except Exception as e:
51
- print(f"๐Ÿšจ PDF ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {filename} - {e}")
52
 
53
  if not docs:
54
- raise RuntimeError("โŒ PDF์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ํ™•์ธ ํ•„์š”!")
55
 
56
  # 5. 텍스트 분할
57
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
58
  texts = splitter.split_documents(docs)
59
 
60
- # 6. ์ž„๋ฒ ๋”ฉ + ๋ฒกํ„ฐDB
61
- embeddings = HuggingFaceEmbeddings(
62
- model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
63
- )
64
  vectorstore = FAISS.from_documents(texts, embeddings)
65
 
66
- # 7. Retrieval QA 체인
67
  qa_chain = RetrievalQA.from_chain_type(
68
  llm=llm,
69
  chain_type="stuff",
@@ -71,7 +105,7 @@ qa_chain = RetrievalQA.from_chain_type(
71
  )
72
 
73
  # 8. 챗봇 함수
74
- def chatbot(query: str):
75
  try:
76
  response = qa_chain.run(query)
77
  return response
@@ -80,16 +114,11 @@ def chatbot(query: str):
80
 
81
  # 9. Gradio UI
82
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
83
- gr.Markdown("# ๐Ÿ“ก Ericsson LTE/5G ์žฅ๋น„ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
84
- gr.Markdown("Hugging Face Hub์— ์ €์žฅ๋œ **๋ชจ๋“  PDF**๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ตํ•ฉ๋‹ˆ๋‹ค.")
85
-
86
- query = gr.Textbox(
87
- label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)",
88
- placeholder="์˜ˆ: 5G ์ค‘๊ณ„๊ธฐ ๋ถˆ์š”ํŒŒ ๋ฐœ์ƒ ์›์ธ์€?",
89
- )
90
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
91
- btn = gr.Button("๋ถ„์„ ์‹œ์ž‘")
92
-
93
  btn.click(chatbot, inputs=query, outputs=output)
94
 
95
  if __name__ == "__main__":
 
1
 
2
  import os
3
  import pdfplumber
4
+ from huggingface_hub import hf_hub_download
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
7
  from langchain_community.vectorstores import FAISS
 
10
 
11
  # 1. 환경 변수 확인
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
+ raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. (Spaces > Settings > Repository secrets)")
 
14
 
15
+ # 2. 모델 설정
16
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
17
  llm = HuggingFaceEndpoint(
18
  repo_id=repo_id,
 
21
  task="text-generation"
22
  )
23
 
24
+ # 3. 데이터셋 (PDF 파일 리스트)
25
  dataset_repo = "dgmos/ericsson-manuals"
26
+ pdf_files = [
27
+ "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
28
+ "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
29
+ "(20230531) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 2 (5G)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
30
+ "(20240924) ๋Œ€๊ตฌ์šด์šฉ๋ถ€ TEST BED๋ฅผ ํ™œ์šฉ์„ ํ†ตํ•œ ์ค‘๊ณ„๊ธฐ ์—ญ๋Ÿ‰ํ–ฅ์ƒ_OCR.pdf",
31
+ "(7์›”3์ฃผ)WiFi๊ต์œก(๋ณธ๋ถ€ ๊ณต์œ )_OCR.pdf",
32
+ "1. 5G ๊ธฐ์ง€๊ตญ ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver4.2_OCR.pdf",
33
+ "1. kt MOS ๋‚จ๋ถ€ ๊ธฐ์ˆ (๊ธฐ์ง€๊ตญ) ํ•™์Šต๊ต์žฌ_Ver 5.2_OCR.pdf",
34
+ "10G PON ๋ฐ SFP๋ชจ๋“ˆ ๊ตฌ๋ถ„_OCR.pdf",
35
+ "2. 5G ์ค‘๊ณ„๊ธฐ ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver3.6_OCR.pdf",
36
+ "2. kt MOS ๋‚จ๋ถ€ ๊ธฐ์ˆ (์ค‘๊ณ„๊ธฐ) ํ•™์Šต๊ต์žฌ_Ver 4.5_OCR.pdf",
37
+ "23๋…„ ๋ฌด์„  ์•ก์„ธ์Šค ๊ต์œก ๊ต์•ˆ(์ค‘๊ณ„๊ธฐ์™„์ „์ •๋ณต_์žฅ๋น„์šด์šฉ๊ธฐ์ค€)_OCR.pdf",
38
+ "3. kt MOS ๋‚จ๋ถ€ ๋„คํŠธ์›Œํฌ ํ•™์Šต๊ต์žฌ_Ver 4.7_OCR.pdf",
39
+ "3. ์žฌ๋‚œ์•ˆ์ „ํ†ต์‹ ๋ง ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver3.1_OCR.pdf",
40
+ "5G๊ด‘์ค‘๊ณ„๊ธฐ ์šด์šฉ ๊ถŒ๊ณ ์•ˆV1.0(20200615)_OCR.pdf",
41
+ "Barcode System WEB ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ(์›๋ณธ)_์ˆ˜์ •_OCR.pdf",
42
+ "ELMO(์ ๋ฆฌํ…Œํฌ)_OCR.pdf",
43
+ "ELORHA ๊ต์œก์ž๋ฃŒ_OCR.pdf",
44
+ "JD745B ๊ฐ„ํŽธ ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ-Viavi (Rev1.0)_OCR.pdf",
45
+ "KELIS(์ ๋ฆฌํ…Œํฌ)_OCR.pdf",
46
+ "MMF ์‚ฌ์—…์žฅ ๊ด‘๋ ˆ๋ฒจ ์ •๋น„ ์ž๋ฃŒ)_OCR.pdf",
47
+ "MS2090A ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ_9.4.1(20200612)Trigger_OCR.pdf",
48
+ "Test bed.(์ค‘๊ณ„๊ธฐ ๊ต์•ˆ)pptx_OCR.pdf",
49
+ "V2824_IMN_KO_080718_OCR.pdf",
50
+ "V2824_UMN_1.03_KO_081217_OCR.pdf",
51
+ "[IP-A]ONE-MUX๊ธฐ์ˆ ๋ฐฉ์‹ ๋ฐ ํ˜„์žฅ ์„ ๋กœ ๊ณ ์žฅ์ ๊ฒ€ Guide_OCR.pdf",
52
+ "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ) 1 IP ๋„คํŠธ์›Œํฌ KT ๋ง ๊ตฌ์กฐ ์ดํ•ด_OCR.pdf",
53
+ "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ)_2 IP Access L2์šด์šฉ ๋ฐ ๊ธฐ์ˆ ๋ฐฉ์‹_OCR.pdf",
54
+ "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ)_3 OLT ์šด์šฉ ๋ฐ ๊ธฐ์ˆ ๋ฐฉ์‹_OCR.pdf",
55
+ "[๊ตฌํ˜• ์—˜๋กœํ•˜] ์ œํ’ˆ์†Œ๊ฐœ์„œ_OCR.pdf",
56
+ "[์‹ ํ˜• ์—˜๋กœํ•˜] ์ œํ’ˆ์†Œ๊ฐœ์„œ_OCR.pdf",
57
+ "[ํ˜„์žฅ2] IP ์•ก์„ธ์Šค ์ฝ˜์†”์ ‘์† ๋ฐ G.EasyOne(์žฅ๋น„์ดˆ๊ธฐํ™”)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
58
+ "[ํ˜„์žฅ3] IP ์•ก์„ธ์Šค A-SDN (์žฅ๋น„๊ต์ฒด)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
59
+ "[ํ˜„์žฅ5] IP ์•ก์„ธ์Šค ๊ธด๊ธ‰๋ณต๊ตฌ(๊ณ ์žฅ์‚ฌ๋ก€)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
60
+ "korea_mobile_frequencies.pdf",
61
+ "๋‹ค์‚ฐ ์ค‘์šฉ๋Ÿ‰ OLT V5832XG ๊ต์œก์ž๋ฃŒ_OCR.pdf",
62
+ "๋Œ€๊ตฌ๋ณธ๋ถ€_์ง€๊ตฌ๋ ฅ๊ฐ•ํ™”๊ต์œก_3์›” ๊ณ„์ธก๊ธฐํ™œ์šฉ๋ฐฉ๋ฒ•_OCR.pdf",
63
+ "๋™๊ณ„_์—ญ๋Ÿ‰๊ฐ•ํ™”_๊ต์œก_ํ•™์Šต๊ต์žฌ(์‹ค๋ฌด๊ณผ์ •_์ค‘๊ณ„๊ธฐ_๊ณ„์ธก๊ธฐ)_Ver3.0_OCR.pdf",
64
+ "์‹ค๋ฌด์ž ๊ต์œก(10์›”3์ฃผ)_์ค‘์šฉ๋Ÿ‰ OLT(๊น€๋ช…์œค)_OCR.pdf",
65
+ "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
66
+ ]
67
 
68
  # 4. PDF → 텍스트 추출
69
  docs = []
 
80
  ".join(texts).strip()
81
  if text:
82
  docs.append({"page_content": text, "metadata": {"source": filename}})
83
+ print(f"โœ… {filename} ์ฒ˜๋ฆฌ ์™„๋ฃŒ (๊ธธ์ด {len(text)}์ž)")
84
  else:
85
  print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {filename}")
86
  except Exception as e:
87
+ print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {filename} - {e}")
88
 
89
  if not docs:
90
+ raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
91
 
92
  # 5. 텍스트 분할
93
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
94
  texts = splitter.split_documents(docs)
95
 
96
+ # 6. 임베딩 및 벡터 DB
97
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
 
98
  vectorstore = FAISS.from_documents(texts, embeddings)
99
 
100
+ # 7. RAG 체인
101
  qa_chain = RetrievalQA.from_chain_type(
102
  llm=llm,
103
  chain_type="stuff",
 
105
  )
106
 
107
  # 8. 챗봇 함수
108
+ def chatbot(query):
109
  try:
110
  response = qa_chain.run(query)
111
  return response
 
114
 
115
  # 9. Gradio UI
116
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
117
+ gr.Markdown("# ๐Ÿ“ก Ericsson ์žฅ๋น„ ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
118
+ gr.Markdown("์—…๋กœ๋“œ๋œ PDF ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”.")
119
+ query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?", lines=2)
 
 
 
 
120
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
121
+ btn = gr.Button("๋ถ„์„ ์‹คํ–‰")
 
122
  btn.click(chatbot, inputs=query, outputs=output)
123
 
124
  if __name__ == "__main__":