Spaces:
Build error
Build error
Deploy chatbot update
Browse files- app.py +113 -48
- requirements.txt +2 -4
app.py
CHANGED
|
@@ -1,81 +1,146 @@
|
|
| 1 |
|
| 2 |
import os
|
|
|
|
| 3 |
from datasets import load_dataset
|
|
|
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 8 |
from langchain.chains import RetrievalQA
|
| 9 |
import gradio as gr
|
| 10 |
-
import pdfplumber # PDF ํ
์คํธ ์ถ์ถ ๋ผ์ด๋ธ๋ฌ๋ฆฌ
|
| 11 |
|
| 12 |
-
#
|
| 13 |
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
|
| 14 |
-
raise
|
| 15 |
-
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
|
| 19 |
-
llm = HuggingFaceEndpoint(
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
docs = []
|
| 31 |
-
for
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
text = "
|
| 41 |
-
".join([page.extract_text() or "" for page in pdf.pages])
|
| 42 |
-
if text.strip():
|
| 43 |
-
docs.append({"page_content": text})
|
| 44 |
-
except Exception as e:
|
| 45 |
-
print(f"PDF ์ฒ๋ฆฌ ์ค ์ค๋ฅ: {pdf_path} - {str(e)}")
|
| 46 |
-
continue
|
| 47 |
|
| 48 |
-
#
|
| 49 |
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
| 50 |
-
|
| 51 |
|
| 52 |
-
# 6. ์๋ฒ ๋ฉ ๋ฐ ๋ฒกํฐ DB ์์ฑ
|
| 53 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 54 |
-
vectorstore = FAISS.from_documents(
|
| 55 |
|
| 56 |
-
# 7. RAG ์ฒด์ธ
|
| 57 |
qa_chain = RetrievalQA.from_chain_type(
|
| 58 |
llm=llm,
|
| 59 |
chain_type="stuff",
|
| 60 |
retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
|
| 61 |
)
|
| 62 |
|
| 63 |
-
# 8.
|
| 64 |
-
def chatbot(query):
|
|
|
|
|
|
|
| 65 |
try:
|
| 66 |
-
|
| 67 |
-
return response
|
| 68 |
except Exception as e:
|
| 69 |
-
return f"์ค๋ฅ: {
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
gr.
|
| 74 |
-
gr.
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
btn = gr.Button("๋ถ์ ์์!")
|
| 78 |
-
btn.click(chatbot, inputs=query, outputs=output)
|
| 79 |
|
| 80 |
if __name__ == "__main__":
|
| 81 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
|
| 2 |
import os
|
| 3 |
+
import io
|
| 4 |
from datasets import load_dataset
|
| 5 |
+
import pdfplumber
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
from langchain_community.vectorstores import FAISS
|
| 9 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 10 |
from langchain.chains import RetrievalQA
|
| 11 |
import gradio as gr
|
|
|
|
| 12 |
|
| 13 |
+
# --- 0. Required: the HF token must be configured in the Space secrets.
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if HF_TOKEN is None:
    raise RuntimeError("HUGGINGFACEHUB_API_TOKEN ํ๊ฒฝ ๋ณ์๊ฐ ์์ต๋๋ค. Space Settings โ Repository secrets ์ ์ถ๊ฐํ์ธ์.")

# --- 1. LLM configuration (swap repo_id for a different model if needed).
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.2,
    task="text-generation",
)
|
| 22 |
+
|
| 23 |
+
# --- 2. ๋ฐ์ดํฐ์
๋ก๋
|
| 24 |
+
# --- 2. Load the manuals dataset from the Hugging Face Hub.
ds_name = "dgmos/ericsson-manuals"
dataset = load_dataset(ds_name, split="train")  # uploaded files normally land in the "train" split

# --- 3. Log the dataset schema (debug output to the Space logs).
print("Dataset columns:", dataset.column_names)
if len(dataset) > 0:
    print("Sample record keys:", list(dataset[0].keys()))
|
| 31 |
+
|
| 32 |
+
# --- 4. PDF text-extraction utilities (handle the several dataset layouts).
def _pdf_to_text(source):
    """Best-effort text extraction from one PDF.

    source: a filesystem path (str), a file-like object, or raw PDF bytes
            (bytes/bytearray/memoryview — wrapped in BytesIO here, inside the
            try, so a bad payload is caught like any other parse failure).
    Returns the page texts joined by newlines and stripped (possibly ""),
    or None when pdfplumber cannot open/parse the source.
    """
    try:
        if isinstance(source, (bytes, bytearray, memoryview)):
            source = io.BytesIO(bytes(source))
        with pdfplumber.open(source) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages).strip()
    except Exception as e:
        print(f"Failed to extract PDF text from {source!r}: {e}")
        return None


def extract_text_from_record(record):
    """Extract text from one dataset row (dict-like); return str or None.

    Resolution order (first hit wins):
      1) record['text']                — dataset already carries plain text
      2) a path-like field ('path'/'file'/'filename'/'name'), either a local
         path string or a dict wrapping one ('path'/'filename')
      3) a binary field ('bytes'/'file'/'content'), either raw bytes or a
         dict with a 'bytes' entry
    A failed attempt falls through to the next candidate rather than aborting.
    """
    # 1) Pre-extracted text column, when present and non-empty.
    if "text" in record and record["text"]:
        return record["text"]

    # 2) Path-like fields.
    for key in ("path", "file", "filename", "name"):
        val = record.get(key)
        if not val:
            continue
        if isinstance(val, str) and os.path.exists(val):
            text = _pdf_to_text(val)
            if text is not None:
                return text
        # Some datasets store the file field as a dict carrying the path.
        if isinstance(val, dict):
            inner_path = val.get("path") or val.get("filename")
            if isinstance(inner_path, str) and os.path.exists(inner_path):
                text = _pdf_to_text(inner_path)
                if text is not None:
                    return text

    # 3) Binary fields: raw bytes, or a dict with the bytes nested inside.
    for key in ("bytes", "file", "content"):
        val = record.get(key)
        if not val:
            continue
        if isinstance(val, (bytes, bytearray, memoryview)):
            text = _pdf_to_text(val)
            if text is not None:
                return text
        if isinstance(val, dict) and "bytes" in val:
            text = _pdf_to_text(val["bytes"])
            if text is not None:
                return text

    # Nothing usable found.
    return None
|
| 102 |
+
|
| 103 |
+
# --- 5. Extract text from every record (NOTE: can be slow for many/large files).
docs = []
for idx, record in enumerate(dataset):
    extracted = extract_text_from_record(record)
    if not extracted:
        # Surface which record failed and what fields it actually had.
        print(f"โ ๏ธ ๋ ์ฝ๋ {idx}์์ ํ์คํธ๋ฅผ ์ถ์ถํ์ง ๋ชปํ์ต๋๋ค. keys={list(record.keys())}")
        continue
    docs.append({"page_content": extracted})

# Abort early: an empty corpus would make the vector store useless downstream.
if not docs:
    raise RuntimeError("๋ฌธ์์์ ์ถ์ถ๋ ํ์คํธ๊ฐ ์์ต๋๋ค. ๋ฐ์ดํฐ์ ๊ตฌ์กฐ๋ฅผ ํ์ธํ์ธ์.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
# --- 6. Split the extracted text into chunks and embed them.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# BUG FIX: split_documents() requires Document objects (it reads a
# .page_content attribute), but `docs` holds plain dicts, so the original
# call raised AttributeError at startup. create_documents() takes the raw
# strings and builds the Document objects itself.
docs_split = splitter.create_documents([d["page_content"] for d in docs])

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vectorstore = FAISS.from_documents(docs_split, embeddings)

# --- 7. Assemble the RAG chain: retrieve top-3 chunks, stuff them into the prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)
|
| 128 |
|
| 129 |
+
# --- 8. Gradio callback
def chatbot(query: str):
    """Run one user question through the RAG chain.

    Returns the chain's answer, a prompt message when the query is empty or
    whitespace-only, or a formatted error string if the chain raises.
    """
    if not (query and query.strip()):
        return "์ง๋ฌธ์ ์๋ ฅํด ์ฃผ์ธ์."
    try:
        return qa_chain.run(query)
    except Exception as e:
        return f"์ค๋ฅ: {e}"
|
| 137 |
+
|
| 138 |
+
# Build the UI: one question box, one answer box, one button wired to chatbot().
with gr.Blocks() as demo:
    gr.Markdown("## Ericsson ์ฅ๋น ๋งค๋ด์ผ RAG ์ฑ๋ด")
    question_box = gr.Textbox(label="์ง๋ฌธ (ํ๊ตญ์ด/์์ด)")
    answer_box = gr.Textbox(label="์๋ต", lines=10)
    ask_button = gr.Button("์ง์")
    ask_button.click(chatbot, inputs=question_box, outputs=answer_box)
|
|
|
|
|
|
|
| 144 |
|
| 145 |
if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the port Hugging Face Spaces expects.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|
requirements.txt
CHANGED
|
@@ -2,10 +2,8 @@ gradio>=4.0
|
|
| 2 |
langchain-community
|
| 3 |
langchain-huggingface
|
| 4 |
faiss-cpu
|
| 5 |
-
unstructured[all-docs]
|
| 6 |
sentence-transformers
|
| 7 |
huggingface_hub
|
| 8 |
-
pytesseract
|
| 9 |
-
pillow
|
| 10 |
pandas
|
| 11 |
-
|
|
|
|
|
|
| 2 |
langchain-community
|
| 3 |
langchain-huggingface
|
| 4 |
faiss-cpu
|
|
|
|
| 5 |
sentence-transformers
|
| 6 |
huggingface_hub
|
|
|
|
|
|
|
| 7 |
pandas
|
| 8 |
+
pillow
|
| 9 |
+
pdfplumber
|