Spaces:

dgmos
/

ericsson-llm-chatbot

Build error

App Files Files Community

dgmos committed on Oct 3, 2025

Commit

6d76ee3

1 Parent(s): df786c4

Deploy chatbot update

Browse files

Files changed (2) hide show

app.py +113 -48
requirements.txt +2 -4

app.py CHANGED Viewed

@@ -1,81 +1,146 @@
 import os
 from datasets import load_dataset
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEndpoint
 from langchain.chains import RetrievalQA
 import gradio as gr
-import pdfplumber  # PDF 텍스트 추출 라이브러리
-# 1. 환경 변수 설정
 if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
-    raise ValueError("HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. HF Space Settings > Secrets에서 추가하세요.")
-os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
-# 2. 모델 설정
 repo_id = "meta-llama/Llama-3.2-3B-Instruct"
-llm = HuggingFaceEndpoint(
-    repo_id=repo_id,
-    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
-    temperature=0.7,
-    task="text-generation"
-)
-# 3. Hugging Face Dataset 로드
-dataset = load_dataset("dgmos/ericsson-manuals", split="train")
-# 4. PDF 텍스트 추출
 docs = []
-for item in dataset:
-    # 실제 필드명 확인 필요 (예: "file", "path" 등)
-    pdf_path = item.get("file") or item.get("path") or None
-    if not pdf_path:
-        print(f"⚠️ PDF 경로가 없음: {item}")
-        continue
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
-            if text.strip():
-                docs.append({"page_content": text})
-    except Exception as e:
-        print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
-        continue
-# 5. 텍스트 분할
 splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-texts = splitter.split_documents(docs)
-# 6. 임베딩 및 벡터 DB 생성
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
-vectorstore = FAISS.from_documents(texts, embeddings)
-# 7. RAG 체인 생성
 qa_chain = RetrievalQA.from_chain_type(
     llm=llm,
     chain_type="stuff",
     retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
 )
-# 8. 챗봇 함수
-def chatbot(query):
     try:
-        response = qa_chain.run(query)
-        return response
     except Exception as e:
-        return f"오류: {str(e)}."
-# 9. Gradio UI
-with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
-    gr.Markdown("# 🚀 3G/LTE/5G 장비 불량/불요파 분석 챗봇")
-    gr.Markdown("Hugging Face Dataset에서 로드한 PDF를 기반으로 질문만 입력하세요!")
-    query = gr.Textbox(label="질문 (한국어/영어)", placeholder="Spurious Emission 원인은?")
-    output = gr.Textbox(label="응답", lines=10)
-    btn = gr.Button("분석 시작!")
-    btn.click(chatbot, inputs=query, outputs=output)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
+import io
 from datasets import load_dataset
+import pdfplumber
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEndpoint
 from langchain.chains import RetrievalQA
 import gradio as gr
# --- 0. Required: the HF token must be configured in the Space secrets.
# An empty or whitespace-only value is treated like a missing one so that the
# failure happens here, with a clear message, instead of at the first LLM call.
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "").strip()
if not HF_TOKEN:
    raise RuntimeError("HUGGINGFACEHUB_API_TOKEN 환경 변수가 없습니다. Space Settings → Repository secrets 에 추가하세요.")

# --- 1. LLM configuration (swap repo_id for another model if needed)
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.2,
    task="text-generation",
)

# --- 2. Load the dataset (the "train" split is the one read here)
ds_name = "dgmos/ericsson-manuals"
dataset = load_dataset(ds_name, split="train")

# --- 3. Debug log: show the dataset columns and the keys of the first record
print("Dataset columns:", dataset.column_names)
if len(dataset) > 0:
    print("Sample record keys:", list(dataset[0].keys()))
+# --- 4. PDF 텍스트 추출 유틸리티 (여러 케이스 처리)
def extract_text_from_record(record):
    """Return the text content of one dataset row, or None if none is found.

    Sources are tried in this order; the first one that yields non-blank
    text wins:
      1) record["text"]: text that is already present in the row.
      2) "path" / "file" / "filename" / "name": a string naming an existing
         local file, or a dict carrying such a string under "path" or
         "filename". The file is read as a PDF.
      3) "bytes" / "file" / "content": raw PDF bytes (bytes, bytearray or
         memoryview), or a dict carrying them under "bytes".

    A source that cannot be opened, or that yields only blank text, is
    skipped so that the remaining sources still get a chance.

    Args:
        record: mapping for one dataset row (must support ``.get``).

    Returns:
        The extracted text as a str, or None when no source produced text.
    """
    # 1) Use the pre-extracted text column when it has real content.
    text = record.get("text")
    if isinstance(text, str) and text.strip():
        return text

    # 2) Fields that may point at a PDF file on local disk.
    for key in ("path", "file", "filename", "name"):
        val = record.get(key)
        label = "file path"
        if isinstance(val, dict):
            # Some rows wrap the location in a dict instead of a plain string.
            val = val.get("path") or val.get("filename")
            label = "inner path"
        if isinstance(val, str) and os.path.isfile(val):
            text = _read_pdf_text(val, f"{label} {val}")
            if text:
                return text

    # 3) Fields that may hold the PDF content itself.
    for key in ("bytes", "file", "content"):
        val = record.get(key)
        label = f"bytes for key {key}"
        if isinstance(val, dict):
            val = val.get("bytes")
            label = f"nested bytes for key {key}"
        if val and isinstance(val, (bytes, bytearray, memoryview)):
            text = _read_pdf_text(io.BytesIO(bytes(val)), label)
            if text:
                return text

    # Nothing usable was found in this record.
    return None


def _read_pdf_text(source, label):
    """Read every page of a PDF and return the joined, stripped text.

    Args:
        source: a file path or a binary file object accepted by
            ``pdfplumber.open``.
        label: short description of the source, used in the failure log.

    Returns:
        The page texts joined by newlines, or "" when the PDF cannot be read.
    """
    try:
        with pdfplumber.open(source) as pdf:
            pages = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole import.
        print(f"Failed to open {label}: {e}")
        return ""
    return "\n".join(pages).strip()
# --- 5. Extract text from every record (slow when there are many or large files)
# Raw strings are collected here; the splitter turns them into Document
# objects below. Plain dicts cannot be passed to the splitter because it
# reads .page_content / .metadata as attributes.
docs = []
doc_metadatas = []
for i, rec in enumerate(dataset):
    text = extract_text_from_record(rec)
    if text:
        docs.append(text)
        # Keep the originating row index so each chunk can be traced back.
        doc_metadatas.append({"record_index": i})
    else:
        print(f"⚠️ 레코드 {i}에서 텍스트를 추출하지 못했습니다. keys={list(rec.keys())}")
if not docs:
    raise RuntimeError("문서에서 추출된 텍스트가 없습니다. 데이터셋 구조를 확인하세요.")

# --- 6. Split the text into chunks and embed them
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs_split = splitter.create_documents(docs, metadatas=doc_metadatas)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vectorstore = FAISS.from_documents(docs_split, embeddings)

# --- 7. Build the RAG chain (top-3 retrieved chunks are stuffed into the prompt)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
)
+# --- 8. Gradio 인터페이스
def chatbot(query: str):
    """Answer one user question through the retrieval-QA chain.

    Args:
        query: the question typed by the user; may be empty or None.

    Returns:
        The chain's answer, a prompt asking for input when the query is
        blank, or an error message string when the chain call fails.
    """
    if not query or not query.strip():
        return "질문을 입력해 주세요."
    try:
        # invoke() replaces the deprecated Chain.run(); RetrievalQA reads the
        # question from "query" and puts the answer under "result".
        return qa_chain.invoke({"query": query})["result"]
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"오류: {e}"
# Gradio front end: a question box, an answer box and a button that sends
# the question to chatbot() and shows its return value.
demo = gr.Blocks()
with demo:
    gr.Markdown("## Ericsson 장비 매뉴얼 RAG 챗봇")
    q = gr.Textbox(label="질문 (한국어/영어)")
    out = gr.Textbox(lines=10, label="응답")
    btn = gr.Button("질의")
    btn.click(fn=chatbot, inputs=q, outputs=out)

if __name__ == "__main__":
    # Serve on 0.0.0.0:7860 when the file is run as a script.
    demo.launch(server_port=7860, server_name="0.0.0.0")

requirements.txt CHANGED Viewed

@@ -2,10 +2,8 @@ gradio>=4.0
 langchain-community
 langchain-huggingface
 faiss-cpu
-unstructured[all-docs]
 sentence-transformers
 huggingface_hub
-pytesseract
-pillow
 pandas
-pdfplumber  # PDF 텍스트 추출용

 langchain-community
 langchain-huggingface
 faiss-cpu
 sentence-transformers
 huggingface_hub
 pandas
+pillow
+pdfplumber