dgmos committed on
Commit
0950d65
·
1 Parent(s): 10c9ad8

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +35 -40
app.py CHANGED
@@ -1,23 +1,19 @@
1
 
2
  import os
3
- from datasets import load_dataset
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
- from langchain_huggingface import HuggingFaceEndpoint
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
- import pdfplumber
11
 
12
  # 1. 환경 변수 확인
13
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
14
- raise ValueError(
15
- "❌ HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. "
16
- "HF Space Settings > Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”."
17
- )
18
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
19
 
20
- # 2. 모델 설정
21
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
22
  llm = HuggingFaceEndpoint(
23
  repo_id=repo_id,
@@ -26,46 +22,45 @@ llm = HuggingFaceEndpoint(
26
  task="text-generation"
27
  )
28
 
29
- # 3. Hugging Face Datasets 로드
30
- print("📂 Hugging Face Datasets 로딩 중...")
31
- dataset = load_dataset("dgmos/ericsson-manuals", split="train")
 
 
 
32
 
33
  # 4. PDF → 텍스트 추출
34
  docs = []
35
- for item in dataset:
36
- pdf_path = item.get("file") or item.get("path") or None
37
- if not pdf_path:
38
- print(f"⚠️ PDF 경로 없음: {item}")
39
- continue
40
-
41
  try:
42
- pages = []
43
  with pdfplumber.open(pdf_path) as pdf:
 
44
  for page in pdf.pages:
45
  content = page.extract_text()
46
  if content:
47
- pages.append(content)
48
-
49
- # ✅ 줄바꿈 안전 버전
50
- text = "\n".join(pages).strip()
51
-
52
- if text:
53
- docs.append({"page_content": text})
54
- else:
55
- print(f"⚠️ 텍스트 추출 실패: {pdf_path}")
56
-
57
  except Exception as e:
58
- print(f"🚨 PDF 처리 오류: {pdf_path} - {str(e)}")
59
- continue
60
 
61
- print(f"โœ… ์ด {len(docs)} ๊ฐœ PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ ์™„๋ฃŒ")
 
62
 
63
  # 5. 텍스트 분할
64
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
65
  texts = splitter.split_documents(docs)
66
 
67
- # 6. 임베딩 + 벡터 DB
68
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
 
69
  vectorstore = FAISS.from_documents(texts, embeddings)
70
 
71
  # 7. Retrieval QA 체인
@@ -81,19 +76,19 @@ def chatbot(query: str):
81
  response = qa_chain.run(query)
82
  return response
83
  except Exception as e:
84
- return f"❌ 오류 발생: {str(e)}"
85
 
86
  # 9. Gradio UI
87
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
88
- gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
89
- gr.Markdown("Hugging Face Datasets(`dgmos/ericsson-manuals`)에 업로드된 **OCR PDF 매뉴얼**을 기반으로 질의응답을 제공합니다.")
90
 
91
  query = gr.Textbox(
92
  label="질문 입력 (한국어/영어)",
93
- placeholder="예: Spurious Emission 원인은?",
94
  )
95
  output = gr.Textbox(label="응답", lines=10)
96
- btn = gr.Button("분석 시작!")
97
 
98
  btn.click(chatbot, inputs=query, outputs=output)
99
 
 
1
 
2
  import os
3
+ import pdfplumber
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
7
  from langchain_community.vectorstores import FAISS
 
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
 
10
 
11
  # 1. 환경 변수 확인
12
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
13
+ raise ValueError("❌ HUGGINGFACEHUB_API_TOKEN 환경 변수가 없습니다. "
14
+ "HF Space โ†’ Settings โ†’ Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
 
 
 
15
 
16
+ # 2. LLM 모델 (Hugging Face Inference API)
17
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
18
  llm = HuggingFaceEndpoint(
19
  repo_id=repo_id,
 
22
  task="text-generation"
23
  )
24
 
25
+ # 3. 데이터셋에서 PDF 파일 리스트 자동 수집
26
+ dataset_repo = "dgmos/ericsson-manuals"
27
+ all_files = list_repo_files(dataset_repo)
28
+ pdf_files = [f for f in all_files if f.lower().endswith(".pdf")]
29
+
30
+ print(f"📂 총 {len(pdf_files)} 개 PDF 감지됨")
31
 
32
  # 4. PDF → 텍스트 추출
33
  docs = []
34
+ for filename in pdf_files:
 
 
 
 
 
35
  try:
36
+ pdf_path = hf_hub_download(repo_id=dataset_repo, filename=filename)
37
  with pdfplumber.open(pdf_path) as pdf:
38
+ texts = []
39
  for page in pdf.pages:
40
  content = page.extract_text()
41
  if content:
42
+ texts.append(content)
43
+ text = "
44
+ ".join(texts).strip()
45
+ if text:
46
+ docs.append({"page_content": text, "metadata": {"source": filename}})
47
+ print(f"✅ {filename} 처리 완료 (길이: {len(text)}자)")
48
+ else:
49
+ print(f"⚠️ 텍스트 없음: {filename}")
 
 
50
  except Exception as e:
51
+ print(f"🚨 PDF 처리 오류: {filename} - {e}")
 
52
 
53
+ if not docs:
54
+ raise RuntimeError("โŒ PDF์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ํ™•์ธ ํ•„์š”!")
55
 
56
  # 5. 텍스트 분할
57
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
58
  texts = splitter.split_documents(docs)
59
 
60
+ # 6. ์ž„๋ฒ ๋”ฉ + ๋ฒกํ„ฐDB
61
+ embeddings = HuggingFaceEmbeddings(
62
+ model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
63
+ )
64
  vectorstore = FAISS.from_documents(texts, embeddings)
65
 
66
  # 7. Retrieval QA 체인
 
76
  response = qa_chain.run(query)
77
  return response
78
  except Exception as e:
79
+ return f"❌ 오류: {str(e)}"
80
 
81
  # 9. Gradio UI
82
  with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
83
+ gr.Markdown("# 📡 Ericsson LTE/5G 장비 불량/불요파 분석 챗봇")
84
+ gr.Markdown("Hugging Face Hub์— ์ €์žฅ๋œ **๋ชจ๋“  PDF**๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ตํ•ฉ๋‹ˆ๋‹ค.")
85
 
86
  query = gr.Textbox(
87
  label="질문 입력 (한국어/영어)",
88
+ placeholder="예: 5G 중계기 불요파 발생 원인은?",
89
  )
90
  output = gr.Textbox(label="응답", lines=10)
91
+ btn = gr.Button("분석 시작")
92
 
93
  btn.click(chatbot, inputs=query, outputs=output)
94