openfree committed on
Commit
bad7e32
·
verified ·
1 Parent(s): 16c1e8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -46
app.py CHANGED
@@ -241,8 +241,11 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
241
  except ImportError:
242
  return {"error": "einops package required. Installing..."}
243
 
244
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
245
- model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 
 
 
246
  model.eval()
247
 
248
  with torch.no_grad():
@@ -289,18 +292,34 @@ def build_context(query: str, docs: List[str], index, model, use_web: bool, web_
289
  def answer_question(query: str, context: str) -> str:
290
  """Generate answer"""
291
  system = (
292
- "You are a bioinformatics assistant. Be concise and factual. "
293
- "Never give medical advice. Answer in the user's language."
 
 
 
 
 
 
 
 
294
  )
295
 
296
- user_msg = f"Context:\n{context}\n\nQuestion: {query}"
 
 
 
 
 
 
 
 
297
 
298
  messages = [
299
  {"role": "system", "content": system},
300
  {"role": "user", "content": user_msg}
301
  ]
302
 
303
- return call_llm(messages, temperature=0.4, max_tokens=1000)
304
 
305
  # --------------- Streamlit UI ---------------
306
 
@@ -423,73 +442,191 @@ with tab1:
423
  with tab2:
424
  st.subheader("🧬 Protein Analysis")
425
 
 
 
 
 
 
 
 
426
  protein_seq = st.text_area(
427
- "Enter protein sequence:",
428
  value="MKTIIALSYIFCLVFA",
 
429
  height=100
430
  )
431
 
432
- col1, col2 = st.columns(2)
433
-
434
  with col1:
435
- if st.button("Analyze Protein"):
436
- seq = protein_seq.strip().upper()
437
-
438
- # Basic stats
439
- st.write(f"**Length:** {len(seq)}")
440
- st.write(f"**Unique AAs:** {len(set(seq))}")
441
-
442
- # ESM-2 embedding
443
- if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
444
- with st.spinner("Computing embedding..."):
445
- result = esm2_embed(seq, esm_model)
446
- if "error" in result:
447
- st.error(result["error"])
448
- else:
449
- st.success(f"Embedding size: {result['size']}")
450
- st.json({"preview": result["embedding"][:5]})
451
- else:
452
- st.warning("PyTorch not available for embeddings")
453
-
454
  with col2:
455
- st.info("Amino acid composition and structure prediction features coming soon")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
  # DNA tab
458
  with tab3:
459
  st.subheader("🧬 DNA Analysis")
460
 
 
 
 
 
 
 
 
461
  dna_seq = st.text_area(
462
- "Enter DNA sequence:",
463
  value="ATGCGATCGTAGC",
 
464
  height=100
465
  )
466
 
467
- col1, col2 = st.columns(2)
468
-
469
  with col1:
470
- if st.button("Analyze DNA"):
471
- seq = dna_seq.strip().upper()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
- # GC content
474
- gc = (seq.count("G") + seq.count("C")) / len(seq) if seq else 0
 
 
 
 
 
 
 
 
475
 
476
- st.write(f"**Length:** {len(seq)}")
477
- st.write(f"**GC Content:** {gc:.2%}")
 
 
 
 
 
 
 
 
 
 
478
 
479
- # DNA embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
481
- with st.spinner("Computing embedding..."):
 
482
  result = dna_embed(seq, dna_model)
483
  if "error" in result:
484
  st.error(result["error"])
485
  else:
486
- st.success(f"Embedding size: {result['size']}")
487
- st.json({"preview": result["embedding"][:5]})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  else:
489
- st.warning("PyTorch not available for embeddings")
490
-
491
- with col2:
492
- st.info("Motif analysis and structure prediction coming soon")
493
 
494
  # About tab
495
  with tab4:
 
241
  except ImportError:
242
  return {"error": "einops package required. Installing..."}
243
 
244
+ # DNABERT-2 특별 처리
245
+ from transformers import BertModel, BertTokenizer
246
+
247
+ tokenizer = BertTokenizer.from_pretrained(model_name, trust_remote_code=True)
248
+ model = BertModel.from_pretrained(model_name, trust_remote_code=True)
249
  model.eval()
250
 
251
  with torch.no_grad():
 
292
  def answer_question(query: str, context: str) -> str:
293
  """Generate answer"""
294
  system = (
295
+ "You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
296
+ "Your responses should be:\n"
297
+ "1. Comprehensive yet easy to understand\n"
298
+ "2. Well-structured with clear sections\n"
299
+ "3. Include relevant examples and analogies\n"
300
+ "4. Provide actionable insights when appropriate\n"
301
+ "5. Use Korean if the user writes in Korean, otherwise English\n"
302
+ "6. Never provide medical diagnosis or treatment advice\n"
303
+ "7. Format your response with headers, bullet points, and clear paragraphs\n"
304
+ "8. Aim for 300-500 words minimum for complex questions"
305
  )
306
 
307
+ user_msg = f"""Context information:\n{context}\n\n
308
+ User Question: {query}
309
+
310
+ Please provide a detailed, well-structured response that:
311
+ - Directly answers the question
312
+ - Explains the biological background
313
+ - Includes practical implications when relevant
314
+ - Uses simple analogies to explain complex concepts
315
+ - Cites the context when appropriate"""
316
 
317
  messages = [
318
  {"role": "system", "content": system},
319
  {"role": "user", "content": user_msg}
320
  ]
321
 
322
+ return call_llm(messages, temperature=0.4, max_tokens=1500)
323
 
324
  # --------------- Streamlit UI ---------------
325
 
 
442
  with tab2:
443
  st.subheader("🧬 Protein Analysis")
444
 
445
+ st.info("""
446
+ **๋‹จ๋ฐฑ์งˆ ์„œ์—ด ๋ถ„์„์ด๋ž€?**
447
+ - ๋‹จ๋ฐฑ์งˆ์˜ ์•„๋ฏธ๋…ธ์‚ฐ ์„œ์—ด์„ AI๊ฐ€ ๋ถ„์„ํ•˜์—ฌ ๊ธฐ๋Šฅ๊ณผ ๊ตฌ์กฐ๋ฅผ ์˜ˆ์ธกํ•ฉ๋‹ˆ๋‹ค
448
+ - ESM-2๋Š” Meta๊ฐ€ ๊ฐœ๋ฐœํ•œ AI๋กœ, 6์–ต 5์ฒœ๋งŒ๊ฐœ ๋‹จ๋ฐฑ์งˆ์„ ํ•™์Šตํ–ˆ์Šต๋‹ˆ๋‹ค
449
+ - ์šฉ๋„: ์‹ ์•ฝ ๊ฐœ๋ฐœ, ์งˆ๋ณ‘ ์—ฐ๊ตฌ, ์ง„ํ™” ๋ถ„์„ ๋“ฑ
450
+ """)
451
+
452
  protein_seq = st.text_area(
453
+ "๋‹จ๋ฐฑ์งˆ ์„œ์—ด ์ž…๋ ฅ (๋ณต์‚ฌ-๋ถ™์—ฌ๋„ฃ๊ธฐ ๊ฐ€๋Šฅ):",
454
  value="MKTIIALSYIFCLVFA",
455
+ help="๋‹จ๋ฐฑ์งˆ ์„œ์—ด์€ 20๊ฐœ ์•„๋ฏธ๋…ธ์‚ฐ ๋ฌธ์ž(A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y)๋กœ ๊ตฌ์„ฑ๋ฉ๋‹ˆ๋‹ค",
456
  height=100
457
  )
458
 
459
+ st.markdown("**์˜ˆ์ œ ์„œ์—ด (ํด๋ฆญํ•ด์„œ ๋ณต์‚ฌ):**")
460
+ col1, col2, col3 = st.columns(3)
461
  with col1:
462
+ if st.button("์ธ์А๋ฆฐ", key="ins"):
463
+ st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  with col2:
465
+ if st.button("์—”๋Œํ•€", key="end"):
466
+ st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
467
+ with col3:
468
+ if st.button("์˜ฅ์‹œํ† ์‹ ", key="oxy"):
469
+ st.code("CYIQNCPLG", language=None)
470
+
471
+ if st.button("๐Ÿ”ฌ ๋‹จ๋ฐฑ์งˆ ๋ถ„์„ ์‹œ์ž‘", type="primary"):
472
+ seq = protein_seq.strip().upper()
473
+
474
+ # Basic stats
475
+ st.markdown("### ๐Ÿ“Š ๊ธฐ๋ณธ ๋ถ„์„ ๊ฒฐ๊ณผ")
476
+ col1, col2 = st.columns(2)
477
+
478
+ with col1:
479
+ st.metric("์„œ์—ด ๊ธธ์ด", f"{len(seq)} ์•„๋ฏธ๋…ธ์‚ฐ")
480
+ st.metric("๋ถ„์ž๋Ÿ‰ (์ถ”์ •)", f"~{len(seq) * 110} Da")
481
+
482
+ with col2:
483
+ unique_aa = len(set(seq))
484
+ st.metric("์‚ฌ์šฉ๋œ ์•„๋ฏธ๋…ธ์‚ฐ ์ข…๋ฅ˜", f"{unique_aa}๊ฐœ")
485
+ hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
486
+ st.metric("์†Œ์ˆ˜์„ฑ ๋น„์œจ", f"{hydrophobic/len(seq)*100:.1f}%")
487
+
488
+ # AI Analysis
489
+ if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
490
+ st.markdown("### ๐Ÿค– AI ์ž„๋ฒ ๋”ฉ ๋ถ„์„")
491
+ with st.spinner("AI ๋ชจ๋ธ์ด ๋‹จ๋ฐฑ์งˆ์„ ๋ถ„์„์ค‘... (10-30์ดˆ)"):
492
+ result = esm2_embed(seq, esm_model)
493
+ if "error" in result:
494
+ st.error(result["error"])
495
+ else:
496
+ st.success("โœ… AI ๋ถ„์„ ์™„๋ฃŒ!")
497
+
498
+ col1, col2 = st.columns(2)
499
+ with col1:
500
+ st.metric("๋ฒกํ„ฐ ์ฐจ์›", result['size'])
501
+ st.caption("์ด ์ˆซ์ž๋“ค์€ ๋‹จ๋ฐฑ์งˆ์˜ ํŠน์„ฑ์„ ์ˆ˜์น˜ํ™”ํ•œ ๊ฒƒ์ž…๋‹ˆ๋‹ค")
502
+
503
+ with col2:
504
+ st.markdown("**์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**")
505
+ st.code(result["embedding"][:5])
506
+
507
+ st.markdown("""
508
+ **๐ŸŽฏ ์ด ๋ถ„์„์˜ ํ™œ์šฉ:**
509
+ - ์œ ์‚ฌํ•œ ๊ธฐ๋Šฅ์˜ ๋‹จ๋ฐฑ์งˆ ์ฐพ๊ธฐ
510
+ - ๊ตฌ์กฐ ์˜ˆ์ธก์˜ ๊ธฐ์ดˆ ๋ฐ์ดํ„ฐ
511
+ - ๋Œ์—ฐ๋ณ€์ด ์˜ํ–ฅ ์˜ˆ์ธก
512
+ - ์‹ ์•ฝ ํƒ€๊ฒŸ ๋ฐœ๊ตด
513
+ """)
514
+ else:
515
+ st.warning("โš ๏ธ AI ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘... ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”")
516
 
517
  # DNA tab
518
  with tab3:
519
  st.subheader("🧬 DNA Analysis")
520
 
521
+ st.info("""
522
+ **DNA ์„œ์—ด ๋ถ„์„์ด๋ž€?**
523
+ - DNA์˜ ์—ผ๊ธฐ์„œ์—ด(A,T,G,C)์„ AI๊ฐ€ ๋ถ„์„ํ•˜์—ฌ ๊ธฐ๋Šฅ์„ ์˜ˆ์ธกํ•ฉ๋‹ˆ๋‹ค
524
+ - DNABERT-2๋Š” ์ธ๊ฐ„ ๊ฒŒ๋†ˆ ์ „์ฒด๋ฅผ ํ•™์Šตํ•œ AI ๋ชจ๋ธ์ž…๋‹ˆ๋‹ค
525
+ - ์šฉ๋„: ์œ ์ „์ž ๊ธฐ๋Šฅ ์˜ˆ์ธก, ์งˆ๋ณ‘ ๋ณ€์ด ๋ฐœ๊ฒฌ, ์ง„ํ™” ์—ฐ๊ตฌ ๋“ฑ
526
+ """)
527
+
528
  dna_seq = st.text_area(
529
+ "DNA ์„œ์—ด ์ž…๋ ฅ (๋ณต์‚ฌ-๋ถ™์—ฌ๋„ฃ๊ธฐ ๊ฐ€๋Šฅ):",
530
  value="ATGCGATCGTAGC",
531
+ help="DNA๋Š” 4๊ฐœ ์—ผ๊ธฐ(A: ์•„๋ฐ๋‹Œ, T: ํ‹ฐ๋ฏผ, G: ๊ตฌ์•„๋‹Œ, C: ์‹œํ† ์‹ )๋กœ ๊ตฌ์„ฑ๋ฉ๋‹ˆ๋‹ค",
532
  height=100
533
  )
534
 
535
+ st.markdown("**์˜ˆ์ œ ์„œ์—ด (ํด๋ฆญํ•ด์„œ ๋ณต์‚ฌ):**")
536
+ col1, col2, col3 = st.columns(3)
537
  with col1:
538
+ if st.button("TATA ๋ฐ•์Šค", key="tata"):
539
+ st.code("GCGCGATATAAAGGCGCGGGCGCGCG", language=None)
540
+ st.caption("์œ ์ „์ž ๋ฐœํ˜„ ์‹œ์ž‘ ์‹ ํ˜ธ")
541
+ with col2:
542
+ if st.button("ํ”„๋กœ๋ชจํ„ฐ", key="prom"):
543
+ st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
544
+ st.caption("์œ ์ „์ž ์กฐ์ ˆ ์˜์—ญ")
545
+ with col3:
546
+ if st.button("CRISPR ํƒ€๊ฒŸ", key="crispr"):
547
+ st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
548
+ st.caption("์œ ์ „์ž ํŽธ์ง‘ ๋ถ€์œ„")
549
+
550
+ if st.button("๐Ÿ”ฌ DNA ๋ถ„์„ ์‹œ์ž‘", type="primary"):
551
+ seq = dna_seq.strip().upper().replace("U", "T") # RNA의 U를 T로 변환
552
+ seq = ''.join(c for c in seq if c in 'ATGC') # ATGC만 남기기
553
+
554
+ if len(seq) < 3:
555
+ st.error("์ตœ์†Œ 3๊ฐœ ์ด์ƒ์˜ ์—ผ๊ธฐ๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”")
556
+ else:
557
+ st.markdown("### ๐Ÿ“Š ๊ธฐ๋ณธ ๋ถ„์„ ๊ฒฐ๊ณผ")
558
+ col1, col2 = st.columns(2)
559
 
560
+ with col1:
561
+ st.metric("์„œ์—ด ๊ธธ์ด", f"{len(seq)} bp")
562
+ gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
563
+ st.metric("GC ํ•จ๋Ÿ‰", f"{gc:.1f}%")
564
+ if gc > 60:
565
+ st.caption("๐Ÿ”ด ๋†’์Œ: ์•ˆ์ •์ ์ด์ง€๋งŒ ๋ณต์ œ ์–ด๋ ค์›€")
566
+ elif gc < 40:
567
+ st.caption("๐Ÿ”ต ๋‚ฎ์Œ: ๋ถˆ์•ˆ์ •ํ•˜์ง€๋งŒ ๋ณต์ œ ์šฉ์ด")
568
+ else:
569
+ st.caption("๐ŸŸข ์ ์ •: ์ผ๋ฐ˜์ ์ธ ๋ฒ”์œ„")
570
 
571
+ with col2:
572
+ at = (seq.count("A") + seq.count("T")) / len(seq) * 100
573
+ st.metric("AT ํ•จ๋Ÿ‰", f"{at:.1f}%")
574
+
575
+ # ์ฝ”๋ˆ ๋ถ„์„ (3์˜ ๋ฐฐ์ˆ˜์ธ ๊ฒฝ์šฐ)
576
+ if len(seq) % 3 == 0:
577
+ st.metric("๊ฐ€๋Šฅํ•œ ์ฝ”๋ˆ ์ˆ˜", f"{len(seq)//3}๊ฐœ")
578
+ st.caption("๋‹จ๋ฐฑ์งˆ๋กœ ๋ฒˆ์—ญ ๊ฐ€๋Šฅ")
579
+
580
+ # 특별 서열 찾기
581
+ st.markdown("### ๐Ÿ” ์ฃผ์š” ๋ชจํ‹ฐํ”„ ๊ฒ€์ƒ‰")
582
+ motifs_found = []
583
 
584
+ if "TATAAAA" in seq or "TATAAA" in seq:
585
+ motifs_found.append("โœ… TATA box ๋ฐœ๊ฒฌ (์ „์‚ฌ ์‹œ์ž‘ ์‹ ํ˜ธ)")
586
+ if "CAAT" in seq or "CCAAT" in seq:
587
+ motifs_found.append("โœ… CAAT box ๋ฐœ๊ฒฌ (์ „์‚ฌ ์กฐ์ ˆ)")
588
+ if "ATG" in seq:
589
+ motifs_found.append("โœ… ์‹œ์ž‘ ์ฝ”๋ˆ(ATG) ๋ฐœ๊ฒฌ")
590
+ if "TAA" in seq or "TAG" in seq or "TGA" in seq:
591
+ motifs_found.append("โœ… ์ •์ง€ ์ฝ”๋ˆ ๋ฐœ๊ฒฌ")
592
+ if seq.count("CG") > len(seq)/20:
593
+ motifs_found.append("โœ… CpG ์„ฌ ๊ฐ€๋Šฅ์„ฑ (์œ ์ „์ž ์กฐ์ ˆ)")
594
+
595
+ if motifs_found:
596
+ for motif in motifs_found:
597
+ st.write(motif)
598
+ else:
599
+ st.write("ํŠน๋ณ„ํ•œ ๋ชจํ‹ฐํ”„๊ฐ€ ๋ฐœ๊ฒฌ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
600
+
601
+ # AI Analysis
602
  if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
603
+ st.markdown("### ๐Ÿค– AI ์ž„๋ฒ ๋”ฉ ๋ถ„์„")
604
+ with st.spinner("AI ๋ชจ๋ธ์ด DNA๋ฅผ ๋ถ„์„์ค‘... (10-30์ดˆ)"):
605
  result = dna_embed(seq, dna_model)
606
  if "error" in result:
607
  st.error(result["error"])
608
  else:
609
+ st.success("โœ… AI ๋ถ„์„ ์™„๋ฃŒ!")
610
+
611
+ col1, col2 = st.columns(2)
612
+ with col1:
613
+ st.metric("๋ฒกํ„ฐ ์ฐจ์›", result['size'])
614
+ st.caption("DNA ํŠน์„ฑ์„ ์ˆ˜์น˜ํ™”ํ•œ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค")
615
+
616
+ with col2:
617
+ st.markdown("**์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**")
618
+ st.code(result["embedding"][:5])
619
+
620
+ st.markdown("""
621
+ **๐ŸŽฏ ์ด ๋ถ„์„์˜ ํ™œ์šฉ:**
622
+ - ์œ ์ „์ž ๊ธฐ๋Šฅ ์˜ˆ์ธก
623
+ - ํ”„๋กœ๋ชจํ„ฐ/์ธํ•ธ์„œ ์ฐพ๊ธฐ
624
+ - ์ง„ํ™”์  ๋ณด์กด ์˜์—ญ ๋ฐœ๊ฒฌ
625
+ - ์งˆ๋ณ‘ ๊ด€๋ จ ๋ณ€์ด ์˜ˆ์ธก
626
+ - CRISPR ํƒ€๊ฒŸ ๋ถ€์œ„ ํ‰๊ฐ€
627
+ """)
628
  else:
629
+ st.warning("โš ๏ธ AI ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘... ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”")
 
 
 
630
 
631
  # About tab
632
  with tab4: