dgmos committed on
Commit
f2abe6e
·
1 Parent(s): d5cf1a6

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +34 -30
app.py CHANGED
@@ -1,18 +1,19 @@
1
 
2
  import os
3
  import pdfplumber
4
- from huggingface_hub import hf_hub_download
 
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
7
  from langchain_community.vectorstores import FAISS
8
  from langchain.chains import RetrievalQA
9
- import gradio as gr
10
 
11
- # 1. 환경 변수 확인
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
- raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. (Spaces > Settings > Repository secrets)")
 
14
 
15
- # 2. 모델 설정
16
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
17
  llm = HuggingFaceEndpoint(
18
  repo_id=repo_id,
@@ -21,8 +22,10 @@ llm = HuggingFaceEndpoint(
21
  task="text-generation"
22
  )
23
 
24
- # 3. 데이터셋 (PDF 파일 리스트)
25
  dataset_repo = "dgmos/ericsson-manuals"
 
 
26
  pdf_files = [
27
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
28
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
@@ -65,59 +68,60 @@ pdf_files = [
65
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
66
  ]
67
 
68
- # 4. PDF → 텍스트 추출
69
  docs = []
70
- for filename in pdf_files:
71
  try:
72
- pdf_path = hf_hub_download(repo_id=dataset_repo, filename=filename)
 
73
  with pdfplumber.open(pdf_path) as pdf:
74
- texts = []
75
  for page in pdf.pages:
76
  content = page.extract_text()
77
  if content:
78
  texts.append(content)
79
- text = "\n".join(texts).strip()
80
- if text:
81
- docs.append({"page_content": text, "metadata": {"source": filename}})
82
- print(f"โœ… {filename} ์ฒ˜๋ฆฌ ์™„๋ฃŒ (๊ธธ์ด {len(text)}์ž)")
83
- else:
84
- print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {filename}")
85
  except Exception as e:
86
- print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {filename} - {e}")
87
 
88
  if not docs:
89
  raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
90
 
91
- # 5. 텍스트 분할
 
 
92
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
93
  texts = splitter.split_documents(docs)
94
 
95
- # 6. 임베딩 및 벡터 DB
96
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
97
  vectorstore = FAISS.from_documents(texts, embeddings)
98
 
99
- # 7. RAG 체인
100
  qa_chain = RetrievalQA.from_chain_type(
101
  llm=llm,
102
  chain_type="stuff",
103
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
104
  )
105
 
106
- # 8. 챗봇 함수
107
- def chatbot(query):
108
  try:
109
- response = qa_chain.run(query)
110
- return response
111
  except Exception as e:
112
  return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
113
 
114
- # 9. Gradio UI
115
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
116
- gr.Markdown("# ๐Ÿ“ก Ericsson ์žฅ๋น„ ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
117
- gr.Markdown("์—…๋กœ๋“œ๋œ PDF ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”.")
118
- query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?", lines=2)
119
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
120
- btn = gr.Button("๋ถ„์„ ์‹คํ–‰")
121
  btn.click(chatbot, inputs=query, outputs=output)
122
 
123
  if __name__ == "__main__":
 
1
 
2
  import os
3
  import pdfplumber
4
+ import gradio as gr
5
+ from huggingface_hub import hf_hub_download, login
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain.chains import RetrievalQA
 
10
 
11
+ # 1. Hugging Face 인증
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
+ raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. Spaces โ†’ Settings โ†’ Repository secrets ์— ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
14
+ login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN"))
15
 
16
+ # 2. LLM 모델 설정
17
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
18
  llm = HuggingFaceEndpoint(
19
  repo_id=repo_id,
 
22
  task="text-generation"
23
  )
24
 
25
+ # 3. 대상 데이터셋 Repo 정보
26
  dataset_repo = "dgmos/ericsson-manuals"
27
+
28
+ # 4. 처리할 PDF 파일 리스트 (데이터셋에 올라간 실제 파일명과 동일해야 함)
29
  pdf_files = [
30
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
31
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
 
68
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
69
  ]
70
 
71
+ # 5. PDF 텍스트 추출
72
  docs = []
73
+ for fname in pdf_files:
74
  try:
75
+ pdf_path = hf_hub_download(repo_id=dataset_repo, repo_type="dataset", filename=fname)
76
+ texts = []
77
  with pdfplumber.open(pdf_path) as pdf:
 
78
  for page in pdf.pages:
79
  content = page.extract_text()
80
  if content:
81
  texts.append(content)
82
+ text = "
83
+ ".join(texts).strip()
84
+ if text:
85
+ docs.append({"page_content": text, "metadata": {"source": fname}})
86
+ else:
87
+ print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {fname}")
88
  except Exception as e:
89
+ print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {fname} - {str(e)}")
90
 
91
  if not docs:
92
  raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
93
 
94
+ print(f"โœ… ์ด {len(docs)} ๊ฐœ PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ ์™„๋ฃŒ")
95
+
96
+ # 6. 텍스트 분할
97
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
98
  texts = splitter.split_documents(docs)
99
 
100
+ # 7. 임베딩 + 벡터 DB
101
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
102
  vectorstore = FAISS.from_documents(texts, embeddings)
103
 
104
+ # 8. Retrieval QA 체인
105
  qa_chain = RetrievalQA.from_chain_type(
106
  llm=llm,
107
  chain_type="stuff",
108
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
109
  )
110
 
111
+ # 9. 챗봇 함수
112
+ def chatbot(query: str):
113
  try:
114
+ return qa_chain.run(query)
 
115
  except Exception as e:
116
  return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
117
 
118
+ # 10. Gradio UI
119
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
120
+ gr.Markdown("# ๐Ÿš€ Ericsson 3G/LTE/5G ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
121
+ gr.Markdown("Hugging Face Datasets์˜ PDF ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜ RAG QA")
122
+ query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?")
123
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
124
+ btn = gr.Button("๋ถ„์„ ์‹œ์ž‘")
125
  btn.click(chatbot, inputs=query, outputs=output)
126
 
127
  if __name__ == "__main__":