dgmos committed on
Commit
ad0b19c
·
1 Parent(s): 6d76ee3

Deploy chatbot update

Browse files
Files changed (1) hide show
  1. app.py +66 -112
app.py CHANGED
@@ -1,146 +1,100 @@
1
 
2
  import os
3
- import io
4
  from datasets import load_dataset
5
- import pdfplumber
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.embeddings import HuggingFaceEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEndpoint
10
  from langchain.chains import RetrievalQA
11
  import gradio as gr
 
12
 
13
- # --- 0. ํ•„์ˆ˜: HF ํ† ํฐ์ด Space Secrets์— ์„ค์ •๋˜์–ด ์žˆ์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
14
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
15
- raise RuntimeError("HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. Space Settings โ†’ Repository secrets ์— ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
 
 
 
 
16
 
17
- HF_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
18
-
19
- # --- 1. LLM ์„ค์ • (ํ•„์š”์‹œ ๋‹ค๋ฅธ ๋ชจ๋ธ๋กœ ๋ณ€๊ฒฝ)
20
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
21
- llm = HuggingFaceEndpoint(repo_id=repo_id, huggingfacehub_api_token=HF_TOKEN, temperature=0.2, task="text-generation")
22
-
23
- # --- 2. ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ
24
- ds_name = "dgmos/ericsson-manuals"
25
- dataset = load_dataset(ds_name, split="train") # train split ์‚ฌ์šฉ (์—…๋กœ๋“œํ•œ ํŒŒ์ผ์€ ๋ณดํ†ต train์— ์žˆ์Œ)
26
-
27
- # --- 3. ๋ฐ์ดํ„ฐ์…‹ ์นผ๋Ÿผ ํ™•์ธ (๋””๋ฒ„๊ทธ ๋กœ๊ทธ)
28
- print("Dataset columns:", dataset.column_names)
29
- if len(dataset) > 0:
30
- print("Sample record keys:", list(dataset[0].keys()))
31
-
32
- # --- 4. PDF ํ…์ŠคํŠธ ์ถ”์ถœ ์œ ํ‹ธ๋ฆฌํ‹ฐ (์—ฌ๋Ÿฌ ์ผ€์ด์Šค ์ฒ˜๋ฆฌ)
33
- def extract_text_from_record(record):
34
- """
35
- record: dataset row (dict-like)
36
- returns: extracted text (str) or None
37
- ์ฒ˜๋ฆฌ ์šฐ์„ ์ˆœ์œ„:
38
- 1) record['text'] (์ด๋ฏธ OCR/ํ…์ŠคํŠธ๋กœ ์˜ฌ๋ผ๊ฐ„ ๊ฒฝ์šฐ)
39
- 2) record['path'] ๋˜๋Š” record['file'] ๊ฐ€ ๋กœ์ปฌ ๊ฒฝ๋กœ์ผ ๋•Œ ํŒŒ์ผ ์—ด๊ธฐ
40
- 3) record['file'] ๋˜๋Š” record['bytes'] ๊ฐ€ ๋ฐ”์ด๋„ˆ๋ฆฌ(๋งคํ•‘ํ˜•)์ผ ๋•Œ BytesIO๋กœ ์—ด๊ธฐ
41
- """
42
- # 1) ์ด๋ฏธ text ์นผ๋Ÿผ์ด ์žˆ์œผ๋ฉด ๋ฐ”๋กœ ์‚ฌ์šฉ
43
- if "text" in record and record["text"]:
44
- return record["text"]
45
-
46
- # 2) ๊ฒฝ๋กœ ๊ด€๋ จ ํ•„๋“œ ์ฒดํฌ
47
- for key in ("path", "file", "filename", "name"):
48
- if key in record and record[key]:
49
- val = record[key]
50
- # datasets ๋•Œ ๋กœ์ปฌ ๊ฒฝ๋กœ ๋ฌธ์ž์—ด๋กœ ์ œ๊ณต๋˜๋Š” ๊ฒฝ์šฐ
51
- if isinstance(val, str) and os.path.exists(val):
52
- try:
53
- with pdfplumber.open(val) as pdf:
54
- pages = [p.extract_text() or "" for p in pdf.pages]
55
- return "
56
- ".join(pages).strip()
57
- except Exception as e:
58
- print(f"Failed to open file path {val}: {e}")
59
- # ๊ณ„์† ๋‹ค์Œ ์ผ€์ด์Šค๋กœ
60
-
61
- # ์ผ๋ถ€ dataset์—์„œ file ํ•„๋“œ๊ฐ€ dict ํ˜•ํƒœ๋กœ path ํฌํ•จํ•˜๋Š” ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ
62
- if isinstance(val, dict):
63
- # try path inside dict
64
- inner_path = val.get("path") or val.get("filename")
65
- if inner_path and isinstance(inner_path, str) and os.path.exists(inner_path):
66
- try:
67
- with pdfplumber.open(inner_path) as pdf:
68
- pages = [p.extract_text() or "" for p in pdf.pages]
69
- return "
70
- ".join(pages).strip()
71
- except Exception as e:
72
- print(f"Failed to open inner path {inner_path}: {e}")
73
-
74
- # 3) bytes ํ˜•ํƒœ(field ์ด๋ฆ„์ด 'bytes' ์ด๊ฑฐ๋‚˜ file๊ฐ€ bytes) ์ฒ˜๋ฆฌ
75
- for key in ("bytes", "file", "content"):
76
- if key in record and record[key]:
77
- val = record[key]
78
- # datasets may store bytes as a bytes object or memoryview
79
- if isinstance(val, (bytes, bytearray, memoryview)):
80
- try:
81
- bio = io.BytesIO(bytes(val))
82
- with pdfplumber.open(bio) as pdf:
83
- pages = [p.extract_text() or "" for p in pdf.pages]
84
- return "
85
- ".join(pages).strip()
86
- except Exception as e:
87
- print(f"Failed to open bytes for key {key}: {e}")
88
- # sometimes it's a dict with 'bytes' inside
89
- if isinstance(val, dict) and ("bytes" in val):
90
- b = val["bytes"]
91
- try:
92
- bio = io.BytesIO(bytes(b))
93
- with pdfplumber.open(bio) as pdf:
94
- pages = [p.extract_text() or "" for p in pdf.pages]
95
- return "
96
- ".join(pages).strip()
97
- except Exception as e:
98
- print(f"Failed to open nested bytes for key {key}: {e}")
99
 
100
- # ๋ชป ์ฐพ์Œ
101
- return None
 
102
 
103
- # --- 5. ๋ชจ๋“  ๋ ˆ์ฝ”๋“œ์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ (์ฃผ์˜: ํŒŒ์ผ ์ˆ˜/ํฌ๊ธฐ ๋งŽ์œผ๋ฉด ์‹œ๊ฐ„ ์†Œ์š”)
104
  docs = []
105
- for i, rec in enumerate(dataset):
106
- text = extract_text_from_record(rec)
107
- if text:
108
- docs.append({"page_content": text})
109
- else:
110
- print(f"โš ๏ธ ๋ ˆ์ฝ”๋“œ {i}์—์„œ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. keys={list(rec.keys())}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- if not docs:
113
- raise RuntimeError("๋ฌธ์„œ์—์„œ ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ๋ฐ์ดํ„ฐ์…‹ ๊ตฌ์กฐ๋ฅผ ํ™•์ธํ•˜์„ธ์š”.")
114
 
115
- # --- 6. ํ…์ŠคํŠธ ๋ถ„ํ•  ๋ฐ ์ž„๋ฒ ๋”ฉ
116
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
117
- docs_split = splitter.split_documents(docs)
118
 
 
119
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
120
- vectorstore = FAISS.from_documents(docs_split, embeddings)
121
 
122
- # --- 7. RAG ์ฒด์ธ ๊ตฌ์„ฑ
123
  qa_chain = RetrievalQA.from_chain_type(
124
  llm=llm,
125
  chain_type="stuff",
126
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
127
  )
128
 
129
- # --- 8. Gradio ์ธํ„ฐํŽ˜์ด์Šค
130
  def chatbot(query: str):
131
- if not query or not query.strip():
132
- return "์งˆ๋ฌธ์„ ์ž…๋ ฅํ•ด ์ฃผ์„ธ์š”."
133
  try:
134
- return qa_chain.run(query)
 
135
  except Exception as e:
136
- return f"์˜ค๋ฅ˜: {e}"
137
-
138
- with gr.Blocks() as demo:
139
- gr.Markdown("## Ericsson ์žฅ๋น„ ๋งค๋‰ด์–ผ RAG ์ฑ—๋ด‡")
140
- q = gr.Textbox(label="์งˆ๋ฌธ (ํ•œ๊ตญ์–ด/์˜์–ด)")
141
- out = gr.Textbox(label="์‘๋‹ต", lines=10)
142
- btn = gr.Button("์งˆ์˜")
143
- btn.click(chatbot, inputs=q, outputs=out)
 
 
 
 
 
 
 
144
 
145
  if __name__ == "__main__":
146
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
 
2
  import os
 
3
  from datasets import load_dataset
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
  from langchain_huggingface import HuggingFaceEndpoint
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
+ import pdfplumber
11
 
12
+ # 1. ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ํ™•์ธ
13
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
14
+ raise ValueError(
15
+ "โŒ HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. "
16
+ "HF Space Settings > Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”."
17
+ )
18
+ os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
19
 
20
+ # 2. ๋ชจ๋ธ ์„ค์ • (LLaMA-3.2 3B Instruct)
 
 
21
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
22
+ llm = HuggingFaceEndpoint(
23
+ repo_id=repo_id,
24
+ huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
25
+ temperature=0.7,
26
+ task="text-generation"
27
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
# 3. Load the manual corpus from the Hugging Face Hub.
print("📂 Hugging Face Datasets 로딩 중...")
dataset = load_dataset("dgmos/ericsson-manuals", split="train")


def _extract_pdf_text(pdf_path):
    """Return all extractable text of the PDF at *pdf_path*.

    Pages are joined with newlines; pages with no text layer are skipped.
    The result is stripped and may be empty for pure image scans.
    Raises whatever pdfplumber raises for unreadable files — the caller
    treats that as a per-file, non-fatal error.
    """
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                pages.append(content)
    return "\n".join(pages).strip()


# 4. PDF -> text. Each successfully extracted PDF becomes one raw document.
docs = []
for item in dataset:
    # NOTE(review): assumes each record exposes a local file path under the
    # "file" or "path" key — confirm against the dataset schema (HF datasets
    # sometimes deliver file columns as dicts or raw bytes instead).
    pdf_path = item.get("file") or item.get("path")
    if not pdf_path:
        print(f"⚠️ PDF 경로 없음: {item}")
        continue

    try:
        text = _extract_pdf_text(pdf_path)
    except Exception as e:
        # Best-effort corpus build: one corrupt PDF must not abort startup.
        print(f"🚨 PDF 처리 오류: {pdf_path} - {str(e)}")
        continue

    if text:
        docs.append({"page_content": text})
    else:
        print(f"⚠️ 텍스트 추출 실패: {pdf_path}")

print(f"✅ 총 {len(docs)} 개 PDF에서 텍스트 추출 완료")
 
61
 
62
+ # 5. ํ…์ŠคํŠธ ๋ถ„ํ• 
63
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
64
+ texts = splitter.split_documents(docs)
65
 
66
+ # 6. ์ž„๋ฒ ๋”ฉ + ๋ฒกํ„ฐ DB
67
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
68
+ vectorstore = FAISS.from_documents(texts, embeddings)
69
 
70
# 7. Retrieval-augmented QA: "stuff" the top-3 retrieved chunks into the prompt.
_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=_retriever,
)
76
 
77
+ # 8. ์ฑ—๋ด‡ ํ•จ์ˆ˜
78
  def chatbot(query: str):
 
 
79
  try:
80
+ response = qa_chain.run(query)
81
+ return response
82
  except Exception as e:
83
+ return f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
84
+
85
# 9. Gradio front-end: one question box, one answer box, one trigger button.
with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
    gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
    gr.Markdown("Hugging Face Datasets(`dgmos/ericsson-manuals`)에 업로드된 **OCR PDF 매뉴얼**을 기반으로 질의응답을 제공합니다.")

    question_box = gr.Textbox(
        label="질문 입력 (한국어/영어)",
        placeholder="예: Spurious Emission 원인은?",
    )
    answer_box = gr.Textbox(label="응답", lines=10)
    ask_button = gr.Button("분석 시작!")

    # Route button clicks through the RAG chatbot function.
    ask_button.click(chatbot, inputs=question_box, outputs=answer_box)
98
 
99
  if __name__ == "__main__":
100
  demo.launch(server_name="0.0.0.0", server_port=7860)