dgmos committed on
Commit
6d76ee3
·
1 Parent(s): df786c4

Deploy chatbot update

Browse files
Files changed (2) hide show
  1. app.py +113 -48
  2. requirements.txt +2 -4
app.py CHANGED
@@ -1,81 +1,146 @@
1
 
2
  import os
 
3
  from datasets import load_dataset
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
  from langchain_huggingface import HuggingFaceEndpoint
8
  from langchain.chains import RetrievalQA
9
  import gradio as gr
10
- import pdfplumber # PDF ํ…์ŠคํŠธ ์ถ”์ถœ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ
11
 
12
- # 1. ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ์„ค์ •
13
  if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
14
- raise ValueError("HUGGINGFACEHUB_API_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. HF Space Settings > Secrets์—์„œ ์ถ”๊ฐ€ํ•˜์„ธ์š”.")
15
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
16
 
17
- # 2. ๋ชจ๋ธ ์„ค์ •
 
 
18
  repo_id = "meta-llama/Llama-3.2-3B-Instruct"
19
- llm = HuggingFaceEndpoint(
20
- repo_id=repo_id,
21
- huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
22
- temperature=0.7,
23
- task="text-generation"
24
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- # 3. Hugging Face Dataset ๋กœ๋“œ
27
- dataset = load_dataset("dgmos/ericsson-manuals", split="train")
 
 
 
 
 
 
 
 
 
 
28
 
29
- # 4. PDF ํ…์ŠคํŠธ ์ถ”์ถœ
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  docs = []
31
- for item in dataset:
32
- # ์‹ค์ œ ํ•„๋“œ๋ช… ํ™•์ธ ํ•„์š” (์˜ˆ: "file", "path" ๋“ฑ)
33
- pdf_path = item.get("file") or item.get("path") or None
34
- if not pdf_path:
35
- print(f"โš ๏ธ PDF ๊ฒฝ๋กœ๊ฐ€ ์—†์Œ: {item}")
36
- continue
37
 
38
- try:
39
- with pdfplumber.open(pdf_path) as pdf:
40
- text = "
41
- ".join([page.extract_text() or "" for page in pdf.pages])
42
- if text.strip():
43
- docs.append({"page_content": text})
44
- except Exception as e:
45
- print(f"PDF ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜: {pdf_path} - {str(e)}")
46
- continue
47
 
48
- # 5. ํ…์ŠคํŠธ ๋ถ„ํ• 
49
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
50
- texts = splitter.split_documents(docs)
51
 
52
- # 6. ์ž„๋ฒ ๋”ฉ ๋ฐ ๋ฒกํ„ฐ DB ์ƒ์„ฑ
53
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
54
- vectorstore = FAISS.from_documents(texts, embeddings)
55
 
56
- # 7. RAG ์ฒด์ธ ์ƒ์„ฑ
57
  qa_chain = RetrievalQA.from_chain_type(
58
  llm=llm,
59
  chain_type="stuff",
60
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
61
  )
62
 
63
- # 8. ์ฑ—๋ด‡ ํ•จ์ˆ˜
64
- def chatbot(query):
 
 
65
  try:
66
- response = qa_chain.run(query)
67
- return response
68
  except Exception as e:
69
- return f"์˜ค๋ฅ˜: {str(e)}."
70
-
71
- # 9. Gradio UI
72
- with gr.Blocks(title="Ericsson ์žฅ๋น„ ๋ถ„์„ ์ฑ—๋ด‡") as demo:
73
- gr.Markdown("# ๐Ÿš€ 3G/LTE/5G ์žฅ๋น„ ๋ถˆ๋Ÿ‰/๋ถˆ์š”ํŒŒ ๋ถ„์„ ์ฑ—๋ด‡")
74
- gr.Markdown("Hugging Face Dataset์—์„œ ๋กœ๋“œํ•œ PDF๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ๋งŒ ์ž…๋ ฅํ•˜์„ธ์š”!")
75
- query = gr.Textbox(label="์งˆ๋ฌธ (ํ•œ๊ตญ์–ด/์˜์–ด)", placeholder="Spurious Emission ์›์ธ์€?")
76
- output = gr.Textbox(label="์‘๋‹ต", lines=10)
77
- btn = gr.Button("๋ถ„์„ ์‹œ์ž‘!")
78
- btn.click(chatbot, inputs=query, outputs=output)
79
 
80
  if __name__ == "__main__":
81
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
 
2
  import os
3
+ import io
4
  from datasets import load_dataset
5
+ import pdfplumber
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.embeddings import HuggingFaceEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEndpoint
10
  from langchain.chains import RetrievalQA
11
  import gradio as gr
 
12
 
13
# --- 0. Required: the HF token must be configured in the Space secrets.
# Read it once and reject an empty value as well as a missing one, so a blank
# secret fails here with a clear message instead of later inside the API call.
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("HUGGINGFACEHUB_API_TOKEN 환경 변수가 없습니다. Space Settings → Repository secrets 에 추가하세요.")

# --- 1. LLM configuration (swap in another model here if needed)
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.2,
    task="text-generation",
)

# --- 2. Load the dataset
ds_name = "dgmos/ericsson-manuals"
# Use the train split (the original author notes uploaded files usually land there).
dataset = load_dataset(ds_name, split="train")

# --- 3. Log the dataset schema (debug aid for choosing the extraction path)
print("Dataset columns:", dataset.column_names)
if len(dataset) > 0:
    print("Sample record keys:", list(dataset[0].keys()))
31
+
32
+ # --- 4. PDF ํ…์ŠคํŠธ ์ถ”์ถœ ์œ ํ‹ธ๋ฆฌํ‹ฐ (์—ฌ๋Ÿฌ ์ผ€์ด์Šค ์ฒ˜๋ฆฌ)
33
def _read_pdf_text(source, label):
    """Best-effort text extraction from a single PDF.

    source: what is handed to ``pdfplumber.open`` -- here either a filesystem
        path or an ``io.BytesIO`` wrapping the raw PDF bytes.
    label: short description of *source*, used only in the failure log line.
    returns: page texts joined with newlines and stripped, or None when the
        PDF cannot be read or yields no text at all.
    """
    try:
        with pdfplumber.open(source) as pdf:
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Deliberately broad, as in the original: one unreadable file must not
        # abort the whole ingestion run. Log it and let the caller move on.
        print(f"Failed to open {label}: {e}")
        return None
    return text.strip() or None


def extract_text_from_record(record):
    """Extract the text of one dataset row.

    record: dataset row (dict-like).
    returns: extracted text (str), or None when nothing usable was found.

    Sources are tried in this order; the first one that yields non-empty
    text wins:
      1) record['text'] -- text already present in the row.
      2) 'path' / 'file' / 'filename' / 'name' holding an existing local
         path, either directly as a string or inside a dict under
         'path' / 'filename'.
      3) 'bytes' / 'file' / 'content' holding raw PDF bytes, either directly
         or inside a dict under 'bytes'.

    A PDF that opens but contains no extractable text no longer ends the
    search; the remaining candidates are still tried.
    """
    # 1) Ready-made text needs no PDF parsing.
    if "text" in record and record["text"]:
        return record["text"]

    # 2) Fields that may point at a PDF on the local filesystem.
    for key in ("path", "file", "filename", "name"):
        val = record[key] if key in record else None
        if not val:
            continue
        if isinstance(val, str):
            candidate, label = val, f"file path {val}"
        elif isinstance(val, dict):
            candidate = val.get("path") or val.get("filename")
            label = f"inner path {candidate}"
        else:
            continue
        # Only open strings that name an existing file; a bare file name or a
        # remote reference is skipped rather than reported as a failure.
        if isinstance(candidate, str) and os.path.exists(candidate):
            text = _read_pdf_text(candidate, label)
            if text:
                return text

    # 3) Fields that may carry the PDF content itself.
    for key in ("bytes", "file", "content"):
        val = record[key] if key in record else None
        if not val:
            continue
        if isinstance(val, dict):
            payload, label = val.get("bytes"), f"nested bytes for key {key}"
        else:
            payload, label = val, f"bytes for key {key}"
        # Skip anything that is not real binary data (e.g. a nested
        # {"bytes": None}) instead of pushing it through bytes() and logging
        # a spurious failure.
        if isinstance(payload, (bytes, bytearray, memoryview)) and payload:
            text = _read_pdf_text(io.BytesIO(bytes(payload)), label)
            if text:
                return text

    # Nothing usable in this record.
    return None
102
+
103
# --- 5. Extract text from every record (note: slow with many or large files)
docs = []
for i, rec in enumerate(dataset):
    text = extract_text_from_record(rec)
    if text:
        docs.append({"page_content": text})
    else:
        print(f"⚠️ 레코드 {i}에서 텍스트를 추출하지 못했습니다. keys={list(rec.keys())}")

# Fail fast: an empty corpus would only surface later as an opaque FAISS error.
if not docs:
    raise RuntimeError("문서에서 추출된 텍스트가 없습니다. 데이터셋 구조를 확인하세요.")

# --- 6. Split the text into chunks and embed them
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# `docs` holds plain dicts, but split_documents() reads the .page_content and
# .metadata attributes of Document objects and fails on dicts. Feed the raw
# strings to create_documents(), which chunks them and returns Documents.
docs_split = splitter.create_documents([d["page_content"] for d in docs])

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vectorstore = FAISS.from_documents(docs_split, embeddings)

# --- 7. Build the RAG chain: retrieve the 3 closest chunks and "stuff" them
# into a single prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
)
128
 
129
+ # --- 8. Gradio ์ธํ„ฐํŽ˜์ด์Šค
130
def chatbot(query: str):
    """Answer one user question through the retrieval-QA chain.

    query: the question typed into the UI; may be empty or None.
    returns: the chain's answer text, a prompt asking for input when the
        query is blank, or an error message string if the chain call fails.
    """
    # Blank input: answer immediately instead of spending an LLM call on it.
    if not query or not query.strip():
        return "질문을 입력해 주세요."
    try:
        # Chain.run() is deprecated in LangChain; invoke() returns a dict and
        # RetrievalQA puts the answer under the "result" key.
        return qa_chain.invoke({"query": query})["result"]
    except Exception as e:
        # UI boundary: show the failure in the response box rather than crash.
        return f"오류: {e}"
137
+
138
# Gradio UI: one question box, one answer box, and a button that sends the
# question through chatbot(). Labels are restored to readable Korean.
with gr.Blocks() as demo:
    gr.Markdown("## Ericsson 장비 매뉴얼 RAG 챗봇")
    q = gr.Textbox(label="질문 (한국어/영어)")
    out = gr.Textbox(label="응답", lines=10)
    btn = gr.Button("질의")
    btn.click(chatbot, inputs=q, outputs=out)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -2,10 +2,8 @@ gradio>=4.0
2
  langchain-community
3
  langchain-huggingface
4
  faiss-cpu
5
- unstructured[all-docs]
6
  sentence-transformers
7
  huggingface_hub
8
- pytesseract
9
- pillow
10
  pandas
11
- pdfplumber # PDF 텍스트 추출용
 
 
2
  langchain-community
3
  langchain-huggingface
4
  faiss-cpu
 
5
  sentence-transformers
6
  huggingface_hub
 
 
7
  pandas
8
+ pillow
9
+ pdfplumber