Update app.py
Browse files
app.py
CHANGED
|
@@ -48,6 +48,21 @@ except ImportError:
|
|
| 48 |
BIOPYTHON_AVAILABLE = False
|
| 49 |
print("[WARNING] biopython not available")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# Constants
|
| 52 |
APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
|
| 53 |
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
|
|
@@ -126,9 +141,41 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
|
|
| 126 |
return f"[LLM Error] {e}"
|
| 127 |
|
| 128 |
def load_file_text(upload) -> str:
|
| 129 |
-
"""Load text from uploaded file"""
|
| 130 |
name = upload.name.lower()
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
try:
|
| 133 |
content = upload.read()
|
| 134 |
text = content.decode("utf-8", errors="ignore")
|
|
@@ -415,8 +462,8 @@ tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])
|
|
| 415 |
# File upload
|
| 416 |
with st.expander("π Upload Files", expanded=True):
|
| 417 |
files = st.file_uploader(
|
| 418 |
-
"Upload text/FASTA files",
|
| 419 |
-
type=["txt", "fa", "fasta", "csv", "json"],
|
| 420 |
accept_multiple_files=True
|
| 421 |
)
|
| 422 |
|
|
@@ -424,18 +471,25 @@ with st.expander("π Upload Files", expanded=True):
|
|
| 424 |
docs = []
|
| 425 |
for f in files:
|
| 426 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
text = load_file_text(f)
|
| 428 |
if text:
|
| 429 |
docs.extend(chunk_text(text))
|
|
|
|
| 430 |
except Exception as e:
|
| 431 |
st.error(f"Error reading {f.name}: {e}")
|
| 432 |
|
| 433 |
if docs:
|
| 434 |
st.session_state.docs = docs
|
| 435 |
-
st.success(f"
|
| 436 |
|
| 437 |
if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
|
| 438 |
-
with st.spinner("
|
| 439 |
index, model = build_index(docs)
|
| 440 |
if index:
|
| 441 |
st.session_state.index = index
|
|
@@ -677,11 +731,11 @@ with tab4:
|
|
| 677 |
- 𧬠Protein sequence analysis with ESM-2
|
| 678 |
- 𧬠DNA sequence analysis with DNABERT-2
|
| 679 |
- π Web search integration via Brave API
|
| 680 |
-
- π File upload and vector search
|
| 681 |
|
| 682 |
### Models
|
| 683 |
- **Proteins:** ESM-2 (Facebook)
|
| 684 |
-
- **DNA:** DNABERT-2 (Microsoft)
|
| 685 |
- **LLM:** Llama 3.1 70B (via Fireworks)
|
| 686 |
|
| 687 |
### Disclaimer
|
|
@@ -698,7 +752,9 @@ with tab4:
|
|
| 698 |
"Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
|
| 699 |
"FAISS": FAISS_AVAILABLE,
|
| 700 |
"BioPython": BIOPYTHON_AVAILABLE,
|
| 701 |
-
"Datasets": DATASETS_AVAILABLE
|
|
|
|
|
|
|
| 702 |
}
|
| 703 |
|
| 704 |
for name, available in deps.items():
|
|
|
|
| 48 |
BIOPYTHON_AVAILABLE = False
|
| 49 |
print("[WARNING] biopython not available")
|
| 50 |
|
| 51 |
+
# PDF support libraries
|
| 52 |
+
try:
|
| 53 |
+
import pdfplumber
|
| 54 |
+
PDFPLUMBER_AVAILABLE = True
|
| 55 |
+
except ImportError:
|
| 56 |
+
PDFPLUMBER_AVAILABLE = False
|
| 57 |
+
print("[WARNING] pdfplumber not available")
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
import PyPDF2
|
| 61 |
+
PYPDF2_AVAILABLE = True
|
| 62 |
+
except ImportError:
|
| 63 |
+
PYPDF2_AVAILABLE = False
|
| 64 |
+
print("[WARNING] PyPDF2 not available")
|
| 65 |
+
|
| 66 |
# Constants
|
| 67 |
APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
|
| 68 |
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
|
|
|
|
| 141 |
return f"[LLM Error] {e}"
|
| 142 |
|
| 143 |
def load_file_text(upload) -> str:
|
| 144 |
+
"""Load text from uploaded file (including PDF support)"""
|
| 145 |
name = upload.name.lower()
|
| 146 |
|
| 147 |
+
# PDF handling
|
| 148 |
+
if name.endswith(".pdf"):
|
| 149 |
+
if PDFPLUMBER_AVAILABLE:
|
| 150 |
+
try:
|
| 151 |
+
text_parts = []
|
| 152 |
+
with pdfplumber.open(upload) as pdf:
|
| 153 |
+
for page in pdf.pages:
|
| 154 |
+
page_text = page.extract_text()
|
| 155 |
+
if page_text:
|
| 156 |
+
text_parts.append(page_text)
|
| 157 |
+
return "\n\n".join(text_parts)
|
| 158 |
+
except Exception as e:
|
| 159 |
+
st.error(f"PDF μ½κΈ° μ€λ₯ (pdfplumber): {e}")
|
| 160 |
+
return ""
|
| 161 |
+
|
| 162 |
+
elif PYPDF2_AVAILABLE:
|
| 163 |
+
try:
|
| 164 |
+
upload.seek(0)
|
| 165 |
+
pdf_reader = PyPDF2.PdfReader(upload)
|
| 166 |
+
text_parts = []
|
| 167 |
+
for page_num in range(len(pdf_reader.pages)):
|
| 168 |
+
page = pdf_reader.pages[page_num]
|
| 169 |
+
text_parts.append(page.extract_text())
|
| 170 |
+
return "\n\n".join(text_parts)
|
| 171 |
+
except Exception as e:
|
| 172 |
+
st.error(f"PDF μ½κΈ° μ€λ₯ (PyPDF2): {e}")
|
| 173 |
+
return ""
|
| 174 |
+
else:
|
| 175 |
+
st.error("PDF νμΌμ μ½μΌλ €λ©΄ pdfplumber λλ PyPDF2κ° νμν©λλ€")
|
| 176 |
+
return ""
|
| 177 |
+
|
| 178 |
+
# Existing text file handling
|
| 179 |
try:
|
| 180 |
content = upload.read()
|
| 181 |
text = content.decode("utf-8", errors="ignore")
|
|
|
|
| 462 |
# File upload
|
| 463 |
with st.expander("π Upload Files", expanded=True):
|
| 464 |
files = st.file_uploader(
|
| 465 |
+
"Upload text/FASTA/PDF files", # PDF added
|
| 466 |
+
type=["txt", "fa", "fasta", "csv", "json", "pdf"], # PDF added
|
| 467 |
accept_multiple_files=True
|
| 468 |
)
|
| 469 |
|
|
|
|
| 471 |
docs = []
|
| 472 |
for f in files:
|
| 473 |
try:
|
| 474 |
+
# For PDF files, add a warning message
|
| 475 |
+
if f.name.lower().endswith(".pdf"):
|
| 476 |
+
if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
|
| 477 |
+
st.warning(f"β οΈ PDF μ§μμ μν΄ pdfplumber μ€μΉ νμ: pip install pdfplumber")
|
| 478 |
+
continue
|
| 479 |
+
|
| 480 |
text = load_file_text(f)
|
| 481 |
if text:
|
| 482 |
docs.extend(chunk_text(text))
|
| 483 |
+
st.success(f"β
{f.name} λ‘λ μλ£")
|
| 484 |
except Exception as e:
|
| 485 |
st.error(f"Error reading {f.name}: {e}")
|
| 486 |
|
| 487 |
if docs:
|
| 488 |
st.session_state.docs = docs
|
| 489 |
+
st.success(f"μ΄ {len(docs)}κ° μ²ν¬ μμ± μλ£")
|
| 490 |
|
| 491 |
if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
|
| 492 |
+
with st.spinner("μΈλ±μ€ κ΅¬μΆ μ€..."):
|
| 493 |
index, model = build_index(docs)
|
| 494 |
if index:
|
| 495 |
st.session_state.index = index
|
|
|
|
| 731 |
- 𧬠Protein sequence analysis with ESM-2
|
| 732 |
- 𧬠DNA sequence analysis with DNABERT-2
|
| 733 |
- π Web search integration via Brave API
|
| 734 |
+
- π File upload and vector search (including PDF support)
|
| 735 |
|
| 736 |
### Models
|
| 737 |
- **Proteins:** ESM-2 (Facebook)
|
| 738 |
+
- **DNA:** DNABERT-2 (Microsoft) / BERT (fallback)
|
| 739 |
- **LLM:** Llama 3.1 70B (via Fireworks)
|
| 740 |
|
| 741 |
### Disclaimer
|
|
|
|
| 752 |
"Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
|
| 753 |
"FAISS": FAISS_AVAILABLE,
|
| 754 |
"BioPython": BIOPYTHON_AVAILABLE,
|
| 755 |
+
"Datasets": DATASETS_AVAILABLE,
|
| 756 |
+
"PDF Support (pdfplumber)": PDFPLUMBER_AVAILABLE, # PDF support added
|
| 757 |
+
"PDF Support (PyPDF2)": PYPDF2_AVAILABLE # PDF support added
|
| 758 |
}
|
| 759 |
|
| 760 |
for name, available in deps.items():
|