openfree committed on
Commit
ecc2bb4
·
verified ·
1 Parent(s): 4fd4c0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -8
app.py CHANGED
@@ -48,6 +48,21 @@ except ImportError:
48
  BIOPYTHON_AVAILABLE = False
49
  print("[WARNING] biopython not available")
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # 상수
52
  APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
53
  DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
@@ -126,9 +141,41 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
126
  return f"[LLM Error] {e}"
127
 
128
  def load_file_text(upload) -> str:
129
- """Load text from uploaded file"""
130
  name = upload.name.lower()
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  try:
133
  content = upload.read()
134
  text = content.decode("utf-8", errors="ignore")
@@ -415,8 +462,8 @@ tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])
415
  # File upload
416
  with st.expander("📁 Upload Files", expanded=True):
417
  files = st.file_uploader(
418
- "Upload text/FASTA files",
419
- type=["txt", "fa", "fasta", "csv", "json"],
420
  accept_multiple_files=True
421
  )
422
 
@@ -424,18 +471,25 @@ with st.expander("📁 Upload Files", expanded=True):
424
  docs = []
425
  for f in files:
426
  try:
 
 
 
 
 
 
427
  text = load_file_text(f)
428
  if text:
429
  docs.extend(chunk_text(text))
 
430
  except Exception as e:
431
  st.error(f"Error reading {f.name}: {e}")
432
 
433
  if docs:
434
  st.session_state.docs = docs
435
- st.success(f"Loaded {len(docs)} chunks")
436
 
437
  if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
438
- with st.spinner("Building index..."):
439
  index, model = build_index(docs)
440
  if index:
441
  st.session_state.index = index
@@ -677,11 +731,11 @@ with tab4:
677
  - 🧬 Protein sequence analysis with ESM-2
678
  - 🧬 DNA sequence analysis with DNABERT-2
679
  - πŸ” Web search integration via Brave API
680
- - πŸ“ File upload and vector search
681
 
682
  ### Models
683
  - **Proteins:** ESM-2 (Facebook)
684
- - **DNA:** DNABERT-2 (Microsoft)
685
  - **LLM:** Llama 3.1 70B (via Fireworks)
686
 
687
  ### Disclaimer
@@ -698,7 +752,9 @@ with tab4:
698
  "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
699
  "FAISS": FAISS_AVAILABLE,
700
  "BioPython": BIOPYTHON_AVAILABLE,
701
- "Datasets": DATASETS_AVAILABLE
 
 
702
  }
703
 
704
  for name, available in deps.items():
 
48
  BIOPYTHON_AVAILABLE = False
49
  print("[WARNING] biopython not available")
50
 
51
+ # PDF 지원 라이브러리
52
+ try:
53
+ import pdfplumber
54
+ PDFPLUMBER_AVAILABLE = True
55
+ except ImportError:
56
+ PDFPLUMBER_AVAILABLE = False
57
+ print("[WARNING] pdfplumber not available")
58
+
59
+ try:
60
+ import PyPDF2
61
+ PYPDF2_AVAILABLE = True
62
+ except ImportError:
63
+ PYPDF2_AVAILABLE = False
64
+ print("[WARNING] PyPDF2 not available")
65
+
66
  # 상수
67
  APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
68
  DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
 
141
  return f"[LLM Error] {e}"
142
 
143
  def load_file_text(upload) -> str:
144
+ """Load text from uploaded file (PDF 지원 포함)"""
145
  name = upload.name.lower()
146
 
147
+ # PDF 처리
148
+ if name.endswith(".pdf"):
149
+ if PDFPLUMBER_AVAILABLE:
150
+ try:
151
+ text_parts = []
152
+ with pdfplumber.open(upload) as pdf:
153
+ for page in pdf.pages:
154
+ page_text = page.extract_text()
155
+ if page_text:
156
+ text_parts.append(page_text)
157
+ return "\n\n".join(text_parts)
158
+ except Exception as e:
159
+ st.error(f"PDF 읽기 오류 (pdfplumber): {e}")
160
+ return ""
161
+
162
+ elif PYPDF2_AVAILABLE:
163
+ try:
164
+ upload.seek(0)
165
+ pdf_reader = PyPDF2.PdfReader(upload)
166
+ text_parts = []
167
+ for page_num in range(len(pdf_reader.pages)):
168
+ page = pdf_reader.pages[page_num]
169
+ text_parts.append(page.extract_text())
170
+ return "\n\n".join(text_parts)
171
+ except Exception as e:
172
+ st.error(f"PDF 읽기 오류 (PyPDF2): {e}")
173
+ return ""
174
+ else:
175
+ st.error("PDF 파일을 읽으려면 pdfplumber 또는 PyPDF2가 필요합니다")
176
+ return ""
177
+
178
+ # 기존 텍스트 파일 처리
179
  try:
180
  content = upload.read()
181
  text = content.decode("utf-8", errors="ignore")
 
462
  # File upload
463
  with st.expander("📁 Upload Files", expanded=True):
464
  files = st.file_uploader(
465
+ "Upload text/FASTA/PDF files", # PDF 추가
466
+ type=["txt", "fa", "fasta", "csv", "json", "pdf"], # PDF 추가
467
  accept_multiple_files=True
468
  )
469
 
 
471
  docs = []
472
  for f in files:
473
  try:
474
+ # PDF 파일인 경우 경고 메시지 추가
475
+ if f.name.lower().endswith(".pdf"):
476
+ if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
477
+ st.warning(f"⚠️ PDF 지원을 위해 pdfplumber 설치 필요: pip install pdfplumber")
478
+ continue
479
+
480
  text = load_file_text(f)
481
  if text:
482
  docs.extend(chunk_text(text))
483
+ st.success(f"✅ {f.name} 로드 완료")
484
  except Exception as e:
485
  st.error(f"Error reading {f.name}: {e}")
486
 
487
  if docs:
488
  st.session_state.docs = docs
489
+ st.success(f"총 {len(docs)}개 청크 생성 완료")
490
 
491
  if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
492
+ with st.spinner("인덱스 구축 중..."):
493
  index, model = build_index(docs)
494
  if index:
495
  st.session_state.index = index
 
731
  - 🧬 Protein sequence analysis with ESM-2
732
  - 🧬 DNA sequence analysis with DNABERT-2
733
  - πŸ” Web search integration via Brave API
734
+ - πŸ“ File upload and vector search (including PDF support)
735
 
736
  ### Models
737
  - **Proteins:** ESM-2 (Facebook)
738
+ - **DNA:** DNABERT-2 (Microsoft) / BERT (fallback)
739
  - **LLM:** Llama 3.1 70B (via Fireworks)
740
 
741
  ### Disclaimer
 
752
  "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
753
  "FAISS": FAISS_AVAILABLE,
754
  "BioPython": BIOPYTHON_AVAILABLE,
755
+ "Datasets": DATASETS_AVAILABLE,
756
+ "PDF Support (pdfplumber)": PDFPLUMBER_AVAILABLE, # PDF 지원 추가
757
+ "PDF Support (PyPDF2)": PYPDF2_AVAILABLE # PDF 지원 추가
758
  }
759
 
760
  for name, available in deps.items():