dgmos committed on
Commit
1a381aa
·
1 Parent(s): 8ec8702

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +32 -33
app.py CHANGED
@@ -1,31 +1,27 @@
1
 
2
  import os
3
  import pdfplumber
4
- import gradio as gr
5
- from huggingface_hub import hf_hub_download, login
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain.chains import RetrievalQA
 
10
 
11
- # 1. Hugging Face 인증
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
- raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. Spaces โ†’ Settings โ†’ Repository secrets ์— ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
14
- login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN"))
15
 
16
- # 2. LLM 모델 설정
17
- repo_id = "meta-llama/Llama-3.2-3B-Instruct"
18
  llm = HuggingFaceEndpoint(
19
- repo_id=repo_id,
20
- huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
21
  temperature=0.7,
22
  task="text-generation"
23
  )
24
 
25
- # 3. ๋Œ€์ƒ ๋ฐ์ดํ„ฐ์…‹ Repo ์ •๋ณด
26
- dataset_repo = "dgmos/ericsson-manuals"
27
-
28
- # 4. ์ฒ˜๋ฆฌํ•  PDF ํŒŒ์ผ ๋ฆฌ์ŠคํŠธ
29
  pdf_files = [
30
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
31
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
@@ -68,59 +64,62 @@ pdf_files = [
68
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
69
  ]
70
 
71
- # 5. PDF ํ…์ŠคํŠธ ์ถ”์ถœ
72
  docs = []
 
73
  for fname in pdf_files:
74
  try:
75
- pdf_path = hf_hub_download(repo_id=dataset_repo, repo_type="dataset", filename=fname)
76
  texts = []
77
  with pdfplumber.open(pdf_path) as pdf:
78
- for page in pdf.pages:
79
- content = page.extract_text()
80
- if content:
81
- texts.append(content)
82
- text = "\n".join(texts).strip() # ์ค„๋ฐ”๊ฟˆ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ โ†’ ์ ˆ๋Œ€ ์ž๋™ ์ค„๋ฐ”๊ฟˆ ์•ˆ ์ƒ๊น€
83
- if text:
84
- docs.append({"page_content": text, "metadata": {"source": fname}})
 
 
 
 
85
  else:
86
  print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {fname}")
87
  except Exception as e:
88
  print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {fname} - {str(e)}")
89
 
 
90
  if not docs:
91
  raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
92
 
93
- print(f"โœ… ์ด {len(docs)} ๊ฐœ PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ ์™„๋ฃŒ")
94
-
95
- # 6. ํ…์ŠคํŠธ ๋ถ„ํ• 
96
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
97
  texts = splitter.split_documents(docs)
98
 
99
- # 7. ์ž„๋ฒ ๋”ฉ + ๋ฒกํ„ฐ DB
100
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
101
  vectorstore = FAISS.from_documents(texts, embeddings)
102
 
103
- # 8. Retrieval QA ์ฒด์ธ
104
  qa_chain = RetrievalQA.from_chain_type(
105
  llm=llm,
106
  chain_type="stuff",
107
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
108
  )
109
 
110
- # 9. 챗봇 함수
111
  def chatbot(query: str):
112
  try:
113
  return qa_chain.run(query)
114
  except Exception as e:
115
- return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
116
 
117
- # 10. Gradio UI
118
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
119
- gr.Markdown("# ๐Ÿš€ Ericsson 3G/LTE/5G ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
120
- gr.Markdown("Hugging Face Datasets์˜ PDF ๋งค๋‰ด์–ผ ๊ธฐ๋ฐ˜ RAG QA")
121
  query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?")
122
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
123
- btn = gr.Button("๋ถ„์„ ์‹œ์ž‘")
124
  btn.click(chatbot, inputs=query, outputs=output)
125
 
126
  if __name__ == "__main__":
 
1
 
2
  import os
3
  import pdfplumber
4
+ from huggingface_hub import hf_hub_download
 
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
  from langchain.chains import RetrievalQA
9
+ import gradio as gr
10
 
11
+ # โœ… ํ™˜๊ฒฝ ๋ณ€์ˆ˜ (Secrets์—์„œ ์ž๋™ ์ฃผ์ž…)
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
+ raise ValueError("โŒ HUGGINGFACEHUB_API_TOKEN ์ด ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. Spaces โ†’ Settings โ†’ Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
 
14
 
15
+ # ✅ LLM 모델 설정
 
16
  llm = HuggingFaceEndpoint(
17
+ repo_id="meta-llama/Llama-3.2-3B-Instruct",
18
+ huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
19
  temperature=0.7,
20
  task="text-generation"
21
  )
22
 
23
+ # โœ… Hugging Face Datasets โ†’ PDF ๋‹ค์šด๋กœ๋“œ
24
+ repo_id = "dgmos/ericsson-manuals"
 
 
25
  pdf_files = [
26
  "(20220324) L2 Switch ์šด์šฉ ๋งค๋‰ด์–ผ_Innovation TF_Ver3.1_OCR.pdf",
27
  "(20230504) 23๋…„ ๊ธฐ์ˆ ๊ต์œก ๊ต์žฌ 1 (LTE)_๊ฐ€์น˜ํ˜์‹ ํŒ€_OCR.pdf",
 
64
  "์ฐจ๋‹จ๊ธฐ ์ข…๋ฅ˜ ๋ฐ ์šฉ๋„_OCR.pdf"
65
  ]
66
 
 
67
  docs = []
68
+
69
  for fname in pdf_files:
70
  try:
71
+ pdf_path = hf_hub_download(repo_id=repo_id, filename=fname, repo_type="dataset")
72
  texts = []
73
  with pdfplumber.open(pdf_path) as pdf:
74
+ for page_num, page in enumerate(pdf.pages, start=1):
75
+ try:
76
+ content = page.extract_text()
77
+ if content:
78
+ texts.append(content)
79
+ except Exception as e:
80
+ print(f"โš ๏ธ PDF ํŒŒ์‹ฑ ์˜ค๋ฅ˜ (๋ฌด์‹œ): {fname} p.{page_num} - {str(e)}")
81
+ if texts:
82
+ docs.append({"page_content": "
83
+ ".join(texts)})
84
+ print(f"โœ… ํ…์ŠคํŠธ ์ถ”์ถœ ์„ฑ๊ณต: {fname}")
85
  else:
86
  print(f"โš ๏ธ ํ…์ŠคํŠธ ์—†์Œ: {fname}")
87
  except Exception as e:
88
  print(f"๐Ÿšจ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {fname} - {str(e)}")
89
 
90
+ # โœ… ๋ฌธ์„œ ๊ฒ€์ฆ
91
  if not docs:
92
  raise ValueError("โŒ PDF์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. (docs ๋ฆฌ์ŠคํŠธ ๋น„์–ด์žˆ์Œ)")
93
 
94
+ # โœ… ํ…์ŠคํŠธ ๋ถ„ํ• 
 
 
95
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
96
  texts = splitter.split_documents(docs)
97
 
98
+ # โœ… ๋ฒกํ„ฐ DB ๊ตฌ์ถ•
99
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
100
  vectorstore = FAISS.from_documents(texts, embeddings)
101
 
102
+ # โœ… RAG ์ฒด์ธ
103
  qa_chain = RetrievalQA.from_chain_type(
104
  llm=llm,
105
  chain_type="stuff",
106
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
107
  )
108
 
109
+ # ✅ 챗봇 함수
110
  def chatbot(query: str):
111
  try:
112
  return qa_chain.run(query)
113
  except Exception as e:
114
+ return f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
115
 
116
+ # โœ… Gradio UI
117
  with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
118
+ gr.Markdown("## ๐Ÿš€ 3G/LTE/5G ์žฅ๋น„ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
119
+ gr.Markdown("Hugging Face Dataset์—์„œ OCR PDF ๊ธฐ๋ฐ˜ ์งˆ๋ฌธ ์‘๋‹ต ์ œ๊ณต")
120
  query = gr.Textbox(label="์งˆ๋ฌธ ์ž…๋ ฅ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="์˜ˆ: Spurious Emission ์›์ธ์€?")
121
  output = gr.Textbox(label="์‘๋‹ต", lines=10)
122
+ btn = gr.Button("๋ถ„์„ ์‹œ์ž‘!")
123
  btn.click(chatbot, inputs=query, outputs=output)
124
 
125
  if __name__ == "__main__":