dgmos committed on
Commit
541d9fe
·
1 Parent(s): 5028bb5

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +58 -69
app.py CHANGED
@@ -3,123 +3,112 @@ import os
3
  import pdfplumber
4
  from huggingface_hub import hf_hub_download
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
 
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
 
11
- # โœ… ํ™˜๊ฒฝ ๋ณ€์ˆ˜ (Secrets์—์„œ ์ž๋™ ์ฃผ์ž…)
 
 
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
- raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ์ด ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. Spaces โ†’ Settings โ†’ Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
14
 
15
- # โœ… LLM ๋ชจ๋ธ ์„ค์ •
 
 
 
 
 
16
  llm = HuggingFaceEndpoint(
17
- repo_id="meta-llama/Llama-3.2-3B-Instruct",
18
- huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
19
  temperature=0.7,
20
  task="text-generation"
21
  )
22
 
23
- # โœ… Hugging Face Dataset โ†’ PDF ๋‹ค์šด๋กœ๋“œ
24
- repo_id = "dgmos/ericsson-manuals"
 
25
  pdf_files = [
26
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
27
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
28
  "(20230531) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 2 (5G)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
29
  "(20240924) ๋Œ€๊ตฌ์šด์šฉ๋ถ€ TEST BED๋ฅผ ํ™œ์šฉ์„ ํ†ตํ•œ ์ค‘๊ณ„๊ธฐ ์—ญ๋Ÿ‰ํ–ฅ์ƒ_OCR.pdf",
30
- "(7์›”3์ฃผ)WiFi๊ต์œก(๋ณธ๋ถ€ ๊ณต์œ )_OCR.pdf",
31
- "1. 5G ๊ธฐ์ง€๊ตญ ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver4.2_OCR.pdf",
32
- "1. kt MOS ๋‚จ๋ถ€ ๊ธฐ์ˆ (๊ธฐ์ง€๊ตญ) ํ•™์Šต๊ต์žฌ_Ver 5.2_OCR.pdf",
33
- "10G PON ๋ฐ SFP๋ชจ๋“ˆ ๊ตฌ๋ถ„_OCR.pdf",
34
- "2. 5G ์ค‘๊ณ„๊ธฐ ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver3.6_OCR.pdf",
35
- "2. kt MOS ๋‚จ๋ถ€ ๊ธฐ์ˆ (์ค‘๊ณ„๊ธฐ) ํ•™์Šต๊ต์žฌ_Ver 4.5_OCR.pdf",
36
- "23๋…„ ๋ฌด์„  ์•ก์„ธ์Šค ๊ต์œก ๊ต์•ˆ(์ค‘๊ณ„๊ธฐ์™„์ „์ •๋ณต_์žฅ๋น„์šด์šฉ๊ธฐ์ค€)_OCR.pdf",
37
- "3. kt MOS ๋‚จ๋ถ€ ๋„คํŠธ์›Œํฌ ํ•™์Šต๊ต์žฌ_Ver 4.7_OCR.pdf",
38
- "3. ์žฌ๋‚œ์•ˆ์ „ํ†ต์‹ ๋ง ๊ธฐ์ˆ  ํ•™์Šต๊ต์žฌ_Ver3.1_OCR.pdf",
39
- "5G๊ด‘์ค‘๊ณ„๊ธฐ ์šด์šฉ ๊ถŒ๊ณ ์•ˆV1.0(20200615)_OCR.pdf",
40
- "Barcode System WEB ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ(์›๋ณธ)_์ˆ˜์ •_OCR.pdf",
41
- "ELMO(์ ๋ฆฌํ…Œํฌ)_OCR.pdf",
42
- "ELORHA ๊ต์œก์ž๋ฃŒ_OCR.pdf",
43
- "JD745B ๊ฐ„ํŽธ ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ-Viavi (Rev1.0)_OCR.pdf",
44
- "KELIS(์ ๋ฆฌํ…Œํฌ)_OCR.pdf",
45
- "MMF ์‚ฌ์—…์žฅ ๊ด‘๋ ˆ๋ฒจ ์ •๋น„ ์ž๋ฃŒ)_OCR.pdf",
46
- "MS2090A ์‚ฌ์šฉ์ž ๋งค๋‰ด์–ผ_9.4.1(20200612)Trigger_OCR.pdf",
47
- "Test bed.(์ค‘๊ณ„๊ธฐ ๊ต์•ˆ)pptx_OCR.pdf",
48
- "V2824_IMN_KO_080718_OCR.pdf",
49
- "V2824_UMN_1.03_KO_081217_OCR.pdf",
50
- "[IP-A]ONE-MUX๊ธฐ์ˆ ๋ฐฉ์‹ ๋ฐ ํ˜„์žฅ ์„ ๋กœ ๊ณ ์žฅ์ ๊ฒ€ Guide_OCR.pdf",
51
- "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ) 1 IP ๋„คํŠธ์›Œํฌ KT ๋ง ๊ตฌ์กฐ ์ดํ•ด_OCR.pdf",
52
- "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ)_2 IP Access L2์šด์šฉ ๋ฐ ๊ธฐ์ˆ ๋ฐฉ์‹_OCR.pdf",
53
- "[ktMOS๋‚จ๋ถ€]_24๋…„(์œ ์„ ๋ถ„์•ผ)_3 OLT ์šด์šฉ ๋ฐ ๊ธฐ์ˆ ๋ฐฉ์‹_OCR.pdf",
54
- "[๊ตฌํ˜• ์—˜๋กœํ•˜] ์ œํ’ˆ์†Œ๊ฐœ์„œ_OCR.pdf",
55
- "[์‹ ํ˜• ์—˜๋กœํ•˜] ์ œํ’ˆ์†Œ๊ฐœ์„œ_OCR.pdf",
56
- "[ํ˜„์žฅ2] IP ์•ก์„ธ์Šค ์ฝ˜์†”์ ‘์† ๋ฐ G.EasyOne(์žฅ๋น„์ดˆ๊ธฐํ™”)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
57
- "[ํ˜„์žฅ3] IP ์•ก์„ธ์Šค A-SDN (์žฅ๋น„๊ต์ฒด)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
58
- "[ํ˜„์žฅ5] IP ์•ก์„ธ์Šค ๊ธด๊ธ‰๋ณต๊ตฌ(๊ณ ์žฅ์‚ฌ๋ก€)_KT MOS_๋‚จ๋ถ€_OCR.pdf",
59
- "korea_mobile_frequencies.pdf",
60
- "๋‹ค์‚ฐ ์ค‘์šฉ๋Ÿ‰ OLT V5832XG ๊ต์œก์ž๋ฃŒ_OCR.pdf",
61
- "๋Œ€๊ตฌ๋ณธ๋ถ€_์ง€๊ตฌ๋ ฅ๊ฐ•ํ™”๊ต์œก_3์›” ๊ณ„์ธก๊ธฐํ™œ์šฉ๋ฐฉ๋ฒ•_OCR.pdf",
62
- "๋™๊ณ„_์—ญ๋Ÿ‰๊ฐ•ํ™”_๊ต์œก_ํ•™์Šต๊ต์žฌ(์‹ค๋ฌด๊ณผ์ •_์ค‘๊ณ„๊ธฐ_๊ณ„์ธก๊ธฐ)_Ver3.0_OCR.pdf",
63
- "์‹ค๋ฌด์ž ๊ต์œก(10์›”3์ฃผ)_์ค‘์šฉ๋Ÿ‰ OLT(๊น€๋ช…์œค)_OCR.pdf",
64
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
65
  ]
66
 
 
 
67
  docs = []
68
 
69
- for fname in pdf_files:
70
  try:
71
- pdf_path = hf_hub_download(repo_id=repo_id, filename=fname, repo_type="dataset")
72
- text_content = ""
 
73
  with pdfplumber.open(pdf_path) as pdf:
74
- for page_num, page in enumerate(pdf.pages, start=1):
75
- try:
76
- content = page.extract_text()
77
- if content:
78
- text_content += content + "
79
- "
80
- except Exception as e:
81
- print(f"โš ๏ธ PDF ํŒŒ์‹ฑ ์˜ค๋ฅ˜ (๋ฌด์‹œ): {fname} p.{page_num} - {str(e)}")
82
- if text_content.strip():
83
- docs.append({"page_content": text_content.strip()})
84
- print(f"โœ… ํ…์ŠคํŠธ ์ถ”์ถœ ์„ฑ๊ณต: {fname}")
85
  else:
86
- print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {fname}")
 
87
  except Exception as e:
88
- print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {fname} - {str(e)}")
89
 
90
- # โœ… ๋ฌธ์„œ ๊ฒ€์ฆ
91
  if not docs:
92
- raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
93
 
94
- # โœ… ํ…์ŠคํŠธ ๋ถ„ํ• 
 
 
95
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
96
  texts = splitter.split_documents(docs)
97
 
98
- # โœ… ๋ฒกํ„ฐ DB ๊ตฌ์ถ•
 
 
99
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
100
  vectorstore = FAISS.from_documents(texts, embeddings)
101
 
102
- # โœ… RAG ์ฒด์ธ
 
 
103
  qa_chain = RetrievalQA.from_chain_type(
104
  llm=llm,
105
  chain_type="stuff",
106
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
107
  )
108
 
109
- # โœ… ์ฑ—๋ด‡ ํ•จ์ˆ˜
 
 
110
  def chatbot(query: str):
111
  try:
112
- return qa_chain.run(query)
 
113
  except Exception as e:
114
- return f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
115
 
116
- # โœ… Gradio UI
 
 
117
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
118
- gr.Markdown("## ๐Ÿš€ 3G/LTE/5G ์žฅ๋น„ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
119
- gr.Markdown("Hugging Face Dataset์—์„œ OCR PDF ๊ธฐ๋ฐ˜ ์งˆ๋ฌธ ์‘๋‹ต ์ œ๊ณต")
 
120
  query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?")
121
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
122
  btn = gr.Button("๋ถ„์„ ์‹œ์ž‘!")
 
123
  btn.click(chatbot, inputs=query, outputs=output)
124
 
125
  if __name__ == "__main__":
 
3
  import pdfplumber
4
  from huggingface_hub import hf_hub_download
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
+ from langchain_huggingface import HuggingFaceEndpoint
9
  from langchain.chains import RetrievalQA
10
  import gradio as gr
11
 
12
+ # =========================
13
+ # 1. ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ํ™•์ธ
14
+ # =========================
15
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
16
+ raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. Hugging Face Space Secrets์— ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
17
 
18
+ HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
19
+
20
+ # =========================
21
+ # 2. LLM ๋ชจ๋ธ ์„ค์ •
22
+ # =========================
23
+ repo_id = "meta-llama/Llama-3.2-3B-Instruct"
24
  llm = HuggingFaceEndpoint(
25
+ repo_id=repo_id,
26
+ huggingfacehub_api_token=HF_TOKEN,
27
  temperature=0.7,
28
  task="text-generation"
29
  )
30
 
31
+ # =========================
32
+ # 3. PDF ๋ฐ์ดํ„ฐ์…‹ ๋‹ค์šด๋กœ๋“œ
33
+ # =========================
34
  pdf_files = [
35
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
36
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
37
  "(20230531) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 2 (5G)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
38
  "(20240924) ๋Œ€๊ตฌ์šด์šฉ๋ถ€ TEST BED๋ฅผ ํ™œ์šฉ์„ ํ†ตํ•œ ์ค‘๊ณ„๊ธฐ ์—ญ๋Ÿ‰ํ–ฅ์ƒ_OCR.pdf",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
40
  ]
41
 
42
+ repo_id_dataset = "dgmos/ericsson-manuals"
43
+
44
  docs = []
45
 
46
+ for pdf_name in pdf_files:
47
  try:
48
+ pdf_path = hf_hub_download(repo_id=repo_id_dataset, filename=pdf_name, token=HF_TOKEN)
49
+ text_content = []
50
+
51
  with pdfplumber.open(pdf_path) as pdf:
52
+ for page in pdf.pages:
53
+ content = page.extract_text()
54
+ if content:
55
+ text_content.append(content)
56
+
57
+ text = "
58
+ ".join(text_content) # โ† ์ค„๋ฐ”๊ฟˆ ์•ˆ์ „ ์ฒ˜๋ฆฌ ํ™•์ •
59
+ if text.strip():
60
+ docs.append({"page_content": text})
 
 
61
  else:
62
+ print(f"โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ: {pdf_name}")
63
+
64
  except Exception as e:
65
+ print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {pdf_name} - {str(e)}")
66
 
 
67
  if not docs:
68
+ raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Œ)")
69
 
70
+ # =========================
71
+ # 4. ํ…์ŠคํŠธ ๋ถ„ํ• 
72
+ # =========================
73
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
74
  texts = splitter.split_documents(docs)
75
 
76
+ # =========================
77
+ # 5. ๋ฒกํ„ฐ DB ์ƒ์„ฑ
78
+ # =========================
79
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
80
  vectorstore = FAISS.from_documents(texts, embeddings)
81
 
82
+ # =========================
83
+ # 6. RAG QA ์ฒด์ธ ์ƒ์„ฑ
84
+ # =========================
85
  qa_chain = RetrievalQA.from_chain_type(
86
  llm=llm,
87
  chain_type="stuff",
88
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
89
  )
90
 
91
+ # =========================
92
+ # 7. ์ฑ—๋ด‡ ํ•จ์ˆ˜
93
+ # =========================
94
  def chatbot(query: str):
95
  try:
96
+ response = qa_chain.run(query)
97
+ return response
98
  except Exception as e:
99
+ return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
100
 
101
+ # =========================
102
+ # 8. Gradio UI
103
+ # =========================
104
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
105
+ gr.Markdown("# ๐Ÿš€ 3G/LTE/5G ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡")
106
+ gr.Markdown("์—…๋กœ๋“œ๋œ Ericsson PDF ๋งค๋‰ด์–ผ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ์›์ธ ๋ฐ ์šด์šฉ ๋งค๋‰ด์–ผ ๊ฒ€์ƒ‰ ์ง€์›")
107
+
108
  query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?")
109
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
110
  btn = gr.Button("๋ถ„์„ ์‹œ์ž‘!")
111
+
112
  btn.click(chatbot, inputs=query, outputs=output)
113
 
114
  if __name__ == "__main__":