Spaces:

1-1-3-8
/

test_model

Sleeping

App Files Community

1-1-3-8 committed on Oct 9, 2025

Commit

b0f91f5

verified ·

1 Parent(s): 4242716

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -91

app.py CHANGED Viewed

@@ -1,58 +1,31 @@
-"""
-TASK:
-Modify this RNA secondary-structure analysis code so that instead of using dot-bracket notation
-(e.g., '(((..)))'), it outputs or processes RNA secondary structure using *structural element notation*.
-REQUIREMENTS:
-1. Convert from dot-bracket → structural elements:
-   - '(' and ')' (paired bases) should be grouped and labeled as <stem>
-   - contiguous '.' regions inside parentheses should be labeled as <hairpin> (if within a stem)
-   - contiguous '.' regions outside all parentheses should be labeled as <external_loop>
-   - unpaired regions between stems inside parentheses (bulges or internal loops) can be labeled <internal_loop>
-   - At start and end of the sequence, prepend and append <start> and <end>
-2. Example transformation:
-   Input:
-       RNA: "GCGCGAAAACGCGC"
-       Dot-bracket: "(((((....)))))"
-   Output:
-       Structural notation: "<start><stem><hairpin><stem><end>"
-3. Implementation details:
-   - The program should scan the dot-bracket string left to right.
-   - Detect transitions between paired/unpaired regions.
-   - Use a stack or counter to track nested stems if needed.
-   - Output the element sequence as a string (like '<stem><hairpin><stem><end>').
-4. Preserve all existing code functionality (file I/O, RNA sequence handling, etc.)
-   but replace or augment the output generation with the new structural-element mapping.
-OPTIONAL:
-   - If the code plots or visualizes structures, update the labels to use element names.
-   - If multiple structures are processed, apply the transformation for each.
-COMMENT:
-Insert the conversion logic into a function like:
-   def dotbracket_to_structural(dot_str: str) -> str:
-       ...
-       return structural_str
-"""
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch, re
-MODEL_ID = "llm-rna-api-rmit/rna-structure-model"  # your uploaded model
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
 DB_FULL = re.compile(r"^[().]+$")
 DB_SCAN = re.compile(r"[().]{5,}")
-def _generate(prompt, max_new_tokens=512, temperature=0.0):
-    with torch.no_grad():
-        inputs = tokenizer(prompt, return_tensors="pt")
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
@@ -74,98 +47,63 @@ def _extract_dotbracket(text, length):
     return None
 def dotbracket_to_structural(dot_str: str) -> str:
-    """
-    Convert a dot-bracket string to structural-element notation.
-    Heuristic rules (left-to-right scan):
-      - '(' and ')' => <stem>
-      - '.' with depth == 0 => <external_loop>
-      - '.' with depth  > 0:
-            lookahead to next non-dot:
-               - next == ')' => <hairpin>
-               - next == '(' (or None) => <internal_loop>
-    Groups contiguous regions and wraps with <start> ... <end>.
-    """
     n = len(dot_str)
     res = ["<start>"]
     depth = 0
     i = 0
     def append_once(tag: str):
-        if not res or res[-1] != tag:
             res.append(tag)
     while i < n:
         c = dot_str[i]
         if c == '.':
-            # consume the entire '.' run
             j = i
             while j < n and dot_str[j] == '.':
                 j += 1
             next_char = dot_str[j] if j < n else None
             if depth == 0:
                 label = "<external_loop>"
             else:
-                # Inside a stemmed region:
-                # If we see closing parentheses after the dots, treat as hairpin apex.
-                # If we see another '(', treat as internal loop/bulge/multiloop entry.
-                if next_char == ')':
-                    label = "<hairpin>"
-                else:
-                    label = "<internal_loop>"
             append_once(label)
             i = j
             continue
-        # Paired region: '(' or ')'
-        # We label both as stem; adjust depth appropriately.
         if c == '(':
             append_once("<stem>")
             depth += 1
-        elif c == ')':
             append_once("<stem>")
-            # Close after labeling so that dots immediately following at lower depth
-            # are recognized correctly in the next iteration.
             depth = max(depth - 1, 0)
         i += 1
     res.append("<end>")
     return "".join(res)
-def predict(seq):
     seq = (seq or "").strip().upper()
     if not seq or not set(seq) <= {"A","U","C","G"}:
         return "Please enter an RNA sequence (A/U/C/G)."
     n = len(seq)
     prompt = f"RNA: {seq}\nDot-bracket structure:"
-    text = _generate(prompt, max_new_tokens=n + 20, temperature=0.0)
-    # Try to extract a dot-bracket string of the correct length
     db = _extract_dotbracket(text, n)
     if db is None:
-        # fall back to filtered characters; if still wrong length, echo raw text
         db_chars = [c for c in text if c in "()."]
         db = "".join(db_chars) if len(db_chars) == n else None
         if db is None:
-            return text.strip()  # preserve existing behavior on extraction failure
-    # Convert to structural-element notation
-    structural = dotbracket_to_structural(db)
-    return structural
 demo = gr.Interface(
     fn=predict,
     inputs=gr.Textbox(lines=4, label="RNA Sequence (A/U/C/G)"),
     outputs=gr.Textbox(lines=6, label="Predicted Structural Elements"),
     title="RNA Structure Predictor",
-    description="Uses your fine-tuned model to output RNA secondary structure as structural elements (e.g., <start><stem><hairpin><stem><end>)."
 )
-if __name__ == "__main__":
-    demo.launch()

+import os
+import re
+import torch
 import gradio as gr
+from functools import lru_cache
 from transformers import AutoTokenizer, AutoModelForCausalLM
+MODEL_ID = os.getenv("MODEL_ID", "llm-rna-api-rmit/rna-structure-model")
 DB_FULL = re.compile(r"^[().]+$")
 DB_SCAN = re.compile(r"[().]{5,}")
+@lru_cache(maxsize=1)
+def _load_model_and_tokenizer():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        device_map="auto" if device == "cuda" else None,
+    )
+    model.eval()
+    return tokenizer, model, device
+def _generate(prompt, max_new_tokens=256, temperature=0.0):
+    tokenizer, model, device = _load_model_and_tokenizer()
+    with torch.inference_mode():
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
     return None
 def dotbracket_to_structural(dot_str: str) -> str:
     n = len(dot_str)
     res = ["<start>"]
     depth = 0
     i = 0
     def append_once(tag: str):
+        if res[-1] != tag:
             res.append(tag)
     while i < n:
         c = dot_str[i]
         if c == '.':
             j = i
             while j < n and dot_str[j] == '.':
                 j += 1
             next_char = dot_str[j] if j < n else None
             if depth == 0:
                 label = "<external_loop>"
             else:
+                label = "<hairpin>" if next_char == ')' else "<internal_loop>"
             append_once(label)
             i = j
             continue
         if c == '(':
             append_once("<stem>")
             depth += 1
+        else:  # ')'
             append_once("<stem>")
             depth = max(depth - 1, 0)
         i += 1
     res.append("<end>")
     return "".join(res)
+def predict(seq: str):
     seq = (seq or "").strip().upper()
     if not seq or not set(seq) <= {"A","U","C","G"}:
         return "Please enter an RNA sequence (A/U/C/G)."
     n = len(seq)
     prompt = f"RNA: {seq}\nDot-bracket structure:"
+    text = _generate(prompt, max_new_tokens=n + 32, temperature=0.0)
     db = _extract_dotbracket(text, n)
     if db is None:
         db_chars = [c for c in text if c in "()."]
         db = "".join(db_chars) if len(db_chars) == n else None
         if db is None:
+            return text.strip()
+    return dotbracket_to_structural(db)
 demo = gr.Interface(
     fn=predict,
     inputs=gr.Textbox(lines=4, label="RNA Sequence (A/U/C/G)"),
     outputs=gr.Textbox(lines=6, label="Predicted Structural Elements"),
     title="RNA Structure Predictor",
+    description="Outputs structural-element notation: <start>, <stem>, <hairpin>, <internal_loop>, <external_loop>, <end>."
 )