Spaces:
Build error
Build error
Deploy chatbot update
Browse files
app.py
CHANGED
|
@@ -1,146 +1,100 @@
|
|
| 1 |
|
| 2 |
import os
|
| 3 |
-
import io
|
| 4 |
from datasets import load_dataset
|
| 5 |
-
import pdfplumber
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
from langchain_community.vectorstores import FAISS
|
| 9 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 10 |
from langchain.chains import RetrievalQA
|
| 11 |
import gradio as gr
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
|
| 15 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# --- 1. LLM ์ค์ (ํ์์ ๋ค๋ฅธ ๋ชจ๋ธ๋ก ๋ณ๊ฒฝ)
|
| 20 |
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
|
| 21 |
-
llm = HuggingFaceEndpoint(
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
# --- 3. ๋ฐ์ดํฐ์
์นผ๋ผ ํ์ธ (๋๋ฒ๊ทธ ๋ก๊ทธ)
|
| 28 |
-
print("Dataset columns:", dataset.column_names)
|
| 29 |
-
if len(dataset) > 0:
|
| 30 |
-
print("Sample record keys:", list(dataset[0].keys()))
|
| 31 |
-
|
| 32 |
-
# --- 4. PDF ํ
์คํธ ์ถ์ถ ์ ํธ๋ฆฌํฐ (์ฌ๋ฌ ์ผ์ด์ค ์ฒ๋ฆฌ)
|
| 33 |
-
def _extract_pdf_text(source):
    """Open *source* (a filesystem path or a file-like object) with pdfplumber
    and return the concatenated page text, or None when extraction fails.

    Pages with no extractable text contribute an empty string; pages are
    joined with newlines and the result is stripped (may be "" for a PDF
    with no text layer).
    """
    # Lazy import: pdfplumber is only needed when a real PDF is present,
    # so text-only records work without the dependency installed.
    import pdfplumber

    try:
        with pdfplumber.open(source) as pdf:
            pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages).strip()
    except Exception as e:
        # Best-effort: report and let the caller fall through to the next case.
        print(f"Failed to extract PDF text from {source!r}: {e}")
        return None


def extract_text_from_record(record):
    """Extract document text from one dataset row (dict-like).

    Resolution order:
      1) record['text']                       - text already present (e.g. OCR output)
      2) record['path'|'file'|'filename'|'name'] - a local path string, or a dict
         carrying an inner 'path'/'filename'
      3) record['bytes'|'file'|'content']     - raw PDF bytes, or a dict with a
         'bytes' entry

    Returns the extracted text (str) or None when nothing usable is found.
    """
    # 1) A populated text column wins outright.
    if "text" in record and record["text"]:
        return record["text"]

    # 2) Path-like fields: either a plain path string or a dict wrapping one.
    for key in ("path", "file", "filename", "name"):
        if key in record and record[key]:
            val = record[key]
            if isinstance(val, str) and os.path.exists(val):
                text = _extract_pdf_text(val)
                if text is not None:
                    return text
            if isinstance(val, dict):
                inner_path = val.get("path") or val.get("filename")
                if inner_path and isinstance(inner_path, str) and os.path.exists(inner_path):
                    text = _extract_pdf_text(inner_path)
                    if text is not None:
                        return text

    # 3) Binary fields: raw bytes (or memoryview), or a dict with 'bytes' inside.
    for key in ("bytes", "file", "content"):
        if key in record and record[key]:
            val = record[key]
            if isinstance(val, (bytes, bytearray, memoryview)):
                text = _extract_pdf_text(io.BytesIO(bytes(val)))
                if text is not None:
                    return text
            if isinstance(val, dict) and ("bytes" in val):
                text = _extract_pdf_text(io.BytesIO(bytes(val["bytes"])))
                if text is not None:
                    return text

    # Nothing matched: signal "no text" explicitly.
    return None
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
-
#
|
| 104 |
docs = []
|
| 105 |
-
for
|
| 106 |
-
|
| 107 |
-
if
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
-
|
| 113 |
-
raise RuntimeError("๋ฌธ์์์ ์ถ์ถ๋ ํ
์คํธ๊ฐ ์์ต๋๋ค. ๋ฐ์ดํฐ์
๊ตฌ์กฐ๋ฅผ ํ์ธํ์ธ์.")
|
| 114 |
|
| 115 |
-
#
|
| 116 |
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
| 117 |
-
|
| 118 |
|
|
|
|
| 119 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 120 |
-
vectorstore = FAISS.from_documents(
|
| 121 |
|
| 122 |
-
#
|
| 123 |
qa_chain = RetrievalQA.from_chain_type(
|
| 124 |
llm=llm,
|
| 125 |
chain_type="stuff",
|
| 126 |
retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
|
| 127 |
)
|
| 128 |
|
| 129 |
-
#
|
| 130 |
def chatbot(query: str):
|
| 131 |
-
if not query or not query.strip():
|
| 132 |
-
return "์ง๋ฌธ์ ์
๋ ฅํด ์ฃผ์ธ์."
|
| 133 |
try:
|
| 134 |
-
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
-
return f"์ค๋ฅ: {e}"
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
if __name__ == "__main__":
|
| 146 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
|
| 2 |
import os
|
|
|
|
| 3 |
from datasets import load_dataset
|
|
|
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 8 |
from langchain.chains import RetrievalQA
|
| 9 |
import gradio as gr
|
| 10 |
+
import pdfplumber
|
| 11 |
|
| 12 |
+
# 1. Environment check: fail fast when the HF token secret is missing.
# NOTE(review): the Korean message below is reconstructed from a garbled
# source - confirm wording against the deployed Space.
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    raise ValueError(
        "❌ HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. "
        "HF Space Settings > Secrets에서 추가하세요."
    )
# FIX: removed the redundant `os.environ[...] = os.getenv(...)` self-assignment;
# it could only run when the variable already existed, so it was a no-op.
|
| 19 |
|
| 20 |
+
# 2. LLM configuration: LLaMA-3.2 3B Instruct served via the HF Inference API.
repo_id = "meta-llama/Llama-3.2-3B-Instruct"

# Token is guaranteed present by the startup check above.
_hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=_hf_token,
    temperature=0.7,
    task="text-generation",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# 3. Load the manuals dataset from the Hugging Face Hub.
# NOTE(review): Korean log strings below are reconstructed from a garbled
# source - confirm exact wording.
print("📂 Hugging Face Datasets 로딩 중...")
dataset = load_dataset("dgmos/ericsson-manuals", split="train")

# 4. PDF -> text extraction. Each usable record becomes one entry in `docs`
# shaped as {"page_content": <text>}.
docs = []
for item in dataset:
    # assumes records expose a local path under 'file' or 'path' - TODO confirm
    # against the dataset schema (some datasets store dicts/bytes instead).
    pdf_path = item.get("file") or item.get("path") or None
    if not pdf_path:
        print(f"⚠️ PDF 경로 없음: {item}")
        continue

    try:
        pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)

        # FIX: the page join was garbled (string literal split across lines);
        # pages are joined with explicit newlines.
        text = "\n".join(pages).strip()
        if text:
            docs.append({"page_content": text})
        else:
            print(f"⚠️ 텍스트 추출 실패: {pdf_path}")

    except Exception as e:
        # Best-effort batch processing: log and move on to the next record.
        print(f"🚨 PDF 처리 오류: {pdf_path} - {str(e)}")
        continue

print(f"✅ 총 {len(docs)} 개 PDF에서 텍스트 추출 완료")
|
|
|
|
| 61 |
|
| 62 |
+
# 5. Split extracted text into overlapping chunks for retrieval.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# FIX: `docs` holds plain dicts, not langchain Document objects, so
# split_documents(docs) raises (this matches the Space's build error).
# create_documents() accepts raw strings; the conditional also tolerates
# real Document entries should the extraction step ever produce them.
texts = splitter.create_documents(
    [d["page_content"] if isinstance(d, dict) else d.page_content for d in docs]
)

# 6. Embeddings + FAISS vector store over the chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vectorstore = FAISS.from_documents(texts, embeddings)

# 7. Retrieval-QA chain: "stuff" the top-3 retrieved chunks into the prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)
|
| 76 |
|
| 77 |
+
# 8. Chatbot entry point used by the Gradio UI.
def chatbot(query: str):
    """Answer *query* via the retrieval-QA chain.

    Returns the chain's answer string; a prompt-for-input message when the
    query is blank; or an error string when the chain raises.
    """
    # Guard restored from the previous revision: don't invoke the chain
    # with an empty/whitespace-only query.
    if not query or not query.strip():
        return "질문을 입력해 주세요."
    try:
        response = qa_chain.run(query)
        return response
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"❌ 오류 발생: {str(e)}"
|
| 84 |
+
|
| 85 |
+
# 9. Gradio UI: a single question box wired to the chatbot function.
# NOTE(review): Korean UI strings are reconstructed from a garbled source -
# confirm exact wording against the deployed Space.
with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
    gr.Markdown("# 📡 3G/LTE/5G 장비 불량/부적합 분석 챗봇")
    gr.Markdown(
        "Hugging Face Datasets(`dgmos/ericsson-manuals`)에 업로드된 "
        "**OCR PDF 매뉴얼**을 기반으로 질의응답을 제공합니다."
    )

    query = gr.Textbox(
        label="질문 입력 (한국어/영어)",
        placeholder="예: Spurious Emission 원인은?",
    )
    output = gr.Textbox(label="응답", lines=10)
    btn = gr.Button("분석 시작!")

    btn.click(chatbot, inputs=query, outputs=output)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|