Spaces:

openfree
/

BIOseq

Sleeping

App Files Files Community

openfree committed on Aug 27, 2025

Commit

ecc2bb4

verified ·

1 Parent(s): 4fd4c0d

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -8

app.py CHANGED Viewed

@@ -48,6 +48,21 @@ except ImportError:
     BIOPYTHON_AVAILABLE = False
     print("[WARNING] biopython not available")
 # 상수
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
 DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
@@ -126,9 +141,41 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
         return f"[LLM Error] {e}"
 def load_file_text(upload) -> str:
-    """Load text from uploaded file"""
     name = upload.name.lower()
     try:
         content = upload.read()
         text = content.decode("utf-8", errors="ignore")
@@ -415,8 +462,8 @@ tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])
 # File upload
 with st.expander("📁 Upload Files", expanded=True):
     files = st.file_uploader(
-        "Upload text/FASTA files",
-        type=["txt", "fa", "fasta", "csv", "json"],
         accept_multiple_files=True
     )
@@ -424,18 +471,25 @@ with st.expander("📁 Upload Files", expanded=True):
         docs = []
         for f in files:
             try:
                 text = load_file_text(f)
                 if text:
                     docs.extend(chunk_text(text))
             except Exception as e:
                 st.error(f"Error reading {f.name}: {e}")
         if docs:
             st.session_state.docs = docs
-            st.success(f"Loaded {len(docs)} chunks")
             if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
-                with st.spinner("Building index..."):
                     index, model = build_index(docs)
                     if index:
                         st.session_state.index = index
@@ -677,11 +731,11 @@ with tab4:
     - 🧬 Protein sequence analysis with ESM-2
     - 🧬 DNA sequence analysis with DNABERT-2
     - 🔍 Web search integration via Brave API
-    - 📁 File upload and vector search
     ### Models
     - **Proteins:** ESM-2 (Facebook)
-    - **DNA:** DNABERT-2 (Microsoft)
     - **LLM:** Llama 3.1 70B (via Fireworks)
     ### Disclaimer
@@ -698,7 +752,9 @@ with tab4:
         "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
         "FAISS": FAISS_AVAILABLE,
         "BioPython": BIOPYTHON_AVAILABLE,
-        "Datasets": DATASETS_AVAILABLE
     }
     for name, available in deps.items():

     BIOPYTHON_AVAILABLE = False
     print("[WARNING] biopython not available")
+# PDF 지원 라이브러리
+try:
+    import pdfplumber
+    PDFPLUMBER_AVAILABLE = True
+except ImportError:
+    PDFPLUMBER_AVAILABLE = False
+    print("[WARNING] pdfplumber not available")
+try:
+    import PyPDF2
+    PYPDF2_AVAILABLE = True
+except ImportError:
+    PYPDF2_AVAILABLE = False
+    print("[WARNING] PyPDF2 not available")
 # 상수
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
 DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
         return f"[LLM Error] {e}"
 def load_file_text(upload) -> str:
+    """Load text from uploaded file (PDF 지원 포함)"""
     name = upload.name.lower()
+    # PDF 처리
+    if name.endswith(".pdf"):
+        if PDFPLUMBER_AVAILABLE:
+            try:
+                text_parts = []
+                with pdfplumber.open(upload) as pdf:
+                    for page in pdf.pages:
+                        page_text = page.extract_text()
+                        if page_text:
+                            text_parts.append(page_text)
+                return "\n\n".join(text_parts)
+            except Exception as e:
+                st.error(f"PDF 읽기 오류 (pdfplumber): {e}")
+                return ""
+        elif PYPDF2_AVAILABLE:
+            try:
+                upload.seek(0)
+                pdf_reader = PyPDF2.PdfReader(upload)
+                text_parts = []
+                for page_num in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[page_num]
+                    text_parts.append(page.extract_text())
+                return "\n\n".join(text_parts)
+            except Exception as e:
+                st.error(f"PDF 읽기 오류 (PyPDF2): {e}")
+                return ""
+        else:
+            st.error("PDF 파일을 읽으려면 pdfplumber 또는 PyPDF2가 필요합니다")
+            return ""
+    # 기존 텍스트 파일 처리
     try:
         content = upload.read()
         text = content.decode("utf-8", errors="ignore")
 # File upload
 with st.expander("📁 Upload Files", expanded=True):
     files = st.file_uploader(
+        "Upload text/FASTA/PDF files",  # PDF 추가
+        type=["txt", "fa", "fasta", "csv", "json", "pdf"],  # PDF 추가
         accept_multiple_files=True
     )
         docs = []
         for f in files:
             try:
+                # PDF 파일인 경우 경고 메시지 추가
+                if f.name.lower().endswith(".pdf"):
+                    if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
+                        st.warning(f"⚠️ PDF 지원을 위해 pdfplumber 설치 필요: pip install pdfplumber")
+                        continue
                 text = load_file_text(f)
                 if text:
                     docs.extend(chunk_text(text))
+                    st.success(f"✅ {f.name} 로드 완료")
             except Exception as e:
                 st.error(f"Error reading {f.name}: {e}")
         if docs:
             st.session_state.docs = docs
+            st.success(f"총 {len(docs)}개 청크 생성 완료")
             if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
+                with st.spinner("인덱스 구축 중..."):
                     index, model = build_index(docs)
                     if index:
                         st.session_state.index = index
     - 🧬 Protein sequence analysis with ESM-2
     - 🧬 DNA sequence analysis with DNABERT-2
     - 🔍 Web search integration via Brave API
+    - 📁 File upload and vector search (including PDF support)
     ### Models
     - **Proteins:** ESM-2 (Facebook)
+    - **DNA:** DNABERT-2 (Microsoft) / BERT (fallback)
     - **LLM:** Llama 3.1 70B (via Fireworks)
     ### Disclaimer
         "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
         "FAISS": FAISS_AVAILABLE,
         "BioPython": BIOPYTHON_AVAILABLE,
+        "Datasets": DATASETS_AVAILABLE,
+        "PDF Support (pdfplumber)": PDFPLUMBER_AVAILABLE,  # PDF 지원 추가
+        "PDF Support (PyPDF2)": PYPDF2_AVAILABLE  # PDF 지원 추가
     }
     for name, available in deps.items():