Spaces:
Build error
Build error
Deploy chatbot update
Browse files
app.py
CHANGED
|
@@ -1,146 +1,100 @@
|
|
| 1 |
|
| 2 |
import os
|
| 3 |
-
import io
|
| 4 |
from datasets import load_dataset
|
| 5 |
-
import pdfplumber
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
from langchain_community.vectorstores import FAISS
|
| 9 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 10 |
from langchain.chains import RetrievalQA
|
| 11 |
import gradio as gr
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
|
| 15 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# --- 1. LLM ์ค์ (ํ์์ ๋ค๋ฅธ ๋ชจ๋ธ๋ก ๋ณ๊ฒฝ)
|
| 20 |
repo_id = "meta-llama/Llama-3.2-3B-Instruct"
|
| 21 |
-
llm = HuggingFaceEndpoint(
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
# --- 3. ๋ฐ์ดํฐ์
์นผ๋ผ ํ์ธ (๋๋ฒ๊ทธ ๋ก๊ทธ)
|
| 28 |
-
print("Dataset columns:", dataset.column_names)
|
| 29 |
-
if len(dataset) > 0:
|
| 30 |
-
print("Sample record keys:", list(dataset[0].keys()))
|
| 31 |
-
|
| 32 |
-
# --- 4. PDF ํ
์คํธ ์ถ์ถ ์ ํธ๋ฆฌํฐ (์ฌ๋ฌ ์ผ์ด์ค ์ฒ๋ฆฌ)
|
| 33 |
-
def _extract_pdf_text(source):
    """Open *source* (a filesystem path or a file-like object) with pdfplumber
    and return the concatenated page text, or None when extraction fails.

    Pages with no extractable text contribute an empty string; pages are
    joined with newlines and the result is stripped (may be "" for a PDF
    with no text layer).
    """
    # Lazy import: pdfplumber is only needed when a real PDF is present,
    # so text-only records work without the dependency installed.
    import pdfplumber

    try:
        with pdfplumber.open(source) as pdf:
            pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages).strip()
    except Exception as e:
        # Best-effort: report and let the caller fall through to the next case.
        print(f"Failed to extract PDF text from {source!r}: {e}")
        return None


def extract_text_from_record(record):
    """Extract document text from one dataset row (dict-like).

    Resolution order:
      1) record['text']                       - text already present (e.g. OCR output)
      2) record['path'|'file'|'filename'|'name'] - a local path string, or a dict
         carrying an inner 'path'/'filename'
      3) record['bytes'|'file'|'content']     - raw PDF bytes, or a dict with a
         'bytes' entry

    Returns the extracted text (str) or None when nothing usable is found.
    """
    # 1) A populated text column wins outright.
    if "text" in record and record["text"]:
        return record["text"]

    # 2) Path-like fields: either a plain path string or a dict wrapping one.
    for key in ("path", "file", "filename", "name"):
        if key in record and record[key]:
            val = record[key]
            if isinstance(val, str) and os.path.exists(val):
                text = _extract_pdf_text(val)
                if text is not None:
                    return text
            if isinstance(val, dict):
                inner_path = val.get("path") or val.get("filename")
                if inner_path and isinstance(inner_path, str) and os.path.exists(inner_path):
                    text = _extract_pdf_text(inner_path)
                    if text is not None:
                        return text

    # 3) Binary fields: raw bytes (or memoryview), or a dict with 'bytes' inside.
    for key in ("bytes", "file", "content"):
        if key in record and record[key]:
            val = record[key]
            if isinstance(val, (bytes, bytearray, memoryview)):
                text = _extract_pdf_text(io.BytesIO(bytes(val)))
                if text is not None:
                    return text
            if isinstance(val, dict) and ("bytes" in val):
                text = _extract_pdf_text(io.BytesIO(bytes(val["bytes"])))
                if text is not None:
                    return text

    # Nothing matched: signal "no text" explicitly.
    return None
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
-
#
|
| 104 |
docs = []
|
| 105 |
-
for
|
| 106 |
-
|
| 107 |
-
if
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
-
|
| 113 |
-
raise RuntimeError("๋ฌธ์์์ ์ถ์ถ๋ ํ
์คํธ๊ฐ ์์ต๋๋ค. ๋ฐ์ดํฐ์
๊ตฌ์กฐ๋ฅผ ํ์ธํ์ธ์.")
|
| 114 |
|
| 115 |
-
#
|
| 116 |
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
| 117 |
-
|
| 118 |
|
|
|
|
| 119 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 120 |
-
vectorstore = FAISS.from_documents(
|
| 121 |
|
| 122 |
-
#
|
| 123 |
qa_chain = RetrievalQA.from_chain_type(
|
| 124 |
llm=llm,
|
| 125 |
chain_type="stuff",
|
| 126 |
retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
|
| 127 |
)
|
| 128 |
|
| 129 |
-
#
|
| 130 |
def chatbot(query: str):
|
| 131 |
-
if not query or not query.strip():
|
| 132 |
-
return "์ง๋ฌธ์ ์
๋ ฅํด ์ฃผ์ธ์."
|
| 133 |
try:
|
| 134 |
-
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
-
return f"์ค๋ฅ: {e}"
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
if __name__ == "__main__":
|
| 146 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
|
| 2 |
import os
|
|
|
|
| 3 |
from datasets import load_dataset
|
|
|
|
| 4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import FAISS
|
| 7 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 8 |
from langchain.chains import RetrievalQA
|
| 9 |
import gradio as gr
|
| 10 |
+
import pdfplumber
|
| 11 |
|
| 12 |
+
# 1. Environment check: fail fast when the HF token secret is missing.
# NOTE(review): the Korean message below is reconstructed from a garbled
# source - confirm wording against the deployed Space.
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    raise ValueError(
        "❌ HUGGINGFACEHUB_API_TOKEN 환경 변수가 설정되지 않았습니다. "
        "HF Space Settings > Secrets에서 추가하세요."
    )
# FIX: removed the redundant `os.environ[...] = os.getenv(...)` self-assignment;
# it could only run when the variable already existed, so it was a no-op.
|
| 19 |
|
| 20 |
+
# 2. LLM configuration: LLaMA-3.2 3B Instruct served via the HF Inference API.
repo_id = "meta-llama/Llama-3.2-3B-Instruct"

# Token is guaranteed present by the startup check above.
_hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=_hf_token,
    temperature=0.7,
    task="text-generation",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# 3. Load the manuals dataset from the Hugging Face Hub.
# NOTE(review): Korean log strings below are reconstructed from a garbled
# source - confirm exact wording.
print("📂 Hugging Face Datasets 로딩 중...")
dataset = load_dataset("dgmos/ericsson-manuals", split="train")

# 4. PDF -> text extraction. Each usable record becomes one entry in `docs`
# shaped as {"page_content": <text>}.
docs = []
for item in dataset:
    # assumes records expose a local path under 'file' or 'path' - TODO confirm
    # against the dataset schema (some datasets store dicts/bytes instead).
    pdf_path = item.get("file") or item.get("path") or None
    if not pdf_path:
        print(f"⚠️ PDF 경로 없음: {item}")
        continue

    try:
        pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)

        # FIX: the page join was garbled (string literal split across lines);
        # pages are joined with explicit newlines.
        text = "\n".join(pages).strip()
        if text:
            docs.append({"page_content": text})
        else:
            print(f"⚠️ 텍스트 추출 실패: {pdf_path}")

    except Exception as e:
        # Best-effort batch processing: log and move on to the next record.
        print(f"🚨 PDF 처리 오류: {pdf_path} - {str(e)}")
        continue

print(f"✅ 총 {len(docs)} 개 PDF에서 텍스트 추출 완료")
|
|
|
|
| 61 |
|
| 62 |
+
# 5. Split extracted text into overlapping chunks for retrieval.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# FIX: `docs` holds plain dicts, not langchain Document objects, so
# split_documents(docs) raises (this matches the Space's build error).
# create_documents() accepts raw strings; the conditional also tolerates
# real Document entries should the extraction step ever produce them.
texts = splitter.create_documents(
    [d["page_content"] if isinstance(d, dict) else d.page_content for d in docs]
)

# 6. Embeddings + FAISS vector store over the chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
vectorstore = FAISS.from_documents(texts, embeddings)

# 7. Retrieval-QA chain: "stuff" the top-3 retrieved chunks into the prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)
|
| 76 |
|
| 77 |
+
# 8. Chatbot entry point used by the Gradio UI.
def chatbot(query: str):
    """Answer *query* via the retrieval-QA chain.

    Returns the chain's answer string; a prompt-for-input message when the
    query is blank; or an error string when the chain raises.
    """
    # Guard restored from the previous revision: don't invoke the chain
    # with an empty/whitespace-only query.
    if not query or not query.strip():
        return "질문을 입력해 주세요."
    try:
        response = qa_chain.run(query)
        return response
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"❌ 오류 발생: {str(e)}"
|
| 84 |
+
|
| 85 |
+
# 9. Gradio UI: a single question box wired to the chatbot function.
# NOTE(review): Korean UI strings are reconstructed from a garbled source -
# confirm exact wording against the deployed Space.
with gr.Blocks(title="Ericsson 장비 분석 챗봇") as demo:
    gr.Markdown("# 📡 3G/LTE/5G 장비 불량/부적합 분석 챗봇")
    gr.Markdown(
        "Hugging Face Datasets(`dgmos/ericsson-manuals`)에 업로드된 "
        "**OCR PDF 매뉴얼**을 기반으로 질의응답을 제공합니다."
    )

    query = gr.Textbox(
        label="질문 입력 (한국어/영어)",
        placeholder="예: Spurious Emission 원인은?",
    )
    output = gr.Textbox(label="응답", lines=10)
    btn = gr.Button("분석 시작!")

    btn.click(chatbot, inputs=query, outputs=output)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|