Spaces:

TheBobBob
/

BioModelsRAG-Website_streamlit

Sleeping

App Files Files Community

TheBobBob committed on Sep 13, 2024

Commit

f6b2d60

verified ·

1 Parent(s): d1ee2c9

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -124

app.py CHANGED Viewed

@@ -14,13 +14,14 @@ BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
 LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
-db = None
 def fetch_github_json():
     url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
     headers = {"Accept": "application/vnd.github+json"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         data = response.json()
         if "download_url" in data:
@@ -32,14 +33,15 @@ def fetch_github_json():
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 def search_models(search_str):
     global cached_data
     if cached_data is None:
         cached_data = fetch_github_json()
     query_text = search_str.strip().lower()
     models = {}
     for model_id, model_data in cached_data.items():
         if 'name' in model_data:
             name = model_data['name'].lower()
@@ -47,7 +49,7 @@ def search_models(search_str):
             id = model_data['model_id']
             title = model_data['title']
             authors = model_data['authors']
             if query_text:
                 if ' ' in query_text:
                     query_words = query_text.split(" ")
@@ -70,47 +72,49 @@ def search_models(search_str):
                             'title': title,
                             'authors': authors,
                         }
     return models
 def download_model_file(model_url, model_id):
     """Download the SBML file for ``model_id`` into ``LOCAL_DOWNLOAD_DIR``.

     Args:
         model_url: Accepted for interface compatibility. The value passed in
             is not used: it is replaced by a raw.githubusercontent.com URL
             built from ``model_id``.
         model_id: Identifier used both in the download URL and in the name
             of the saved file (``<model_id>.xml``).

     Returns:
         Path of the saved ``.xml`` file inside ``LOCAL_DOWNLOAD_DIR``.

     Raises:
         ValueError: If the server answers with a status other than 200.
         requests.exceptions.RequestException: If the request fails or does
             not complete within the timeout.
     """
     # The caller-supplied URL is overwritten here; only model_id decides
     # which file is fetched.
     model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     # Without a timeout a stalled connection would block the app forever.
     response = requests.get(model_url, timeout=30)
     if response.status_code != 200:
         raise ValueError(f"Failed to download the model from {model_url}")
     os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
     file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
     # Binary mode: the payload is written exactly as received.
     with open(file_path, 'wb') as file:
         file.write(response.content)
     print(f"Model {model_id} downloaded successfully: {file_path}")
     return file_path
 def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     """Write the Antimony text of an SBML file to ``antimony_file_path``.

     The SBML file is loaded with ``te.loadSBMLModel`` and the loaded model's
     current Antimony text is written to the output path. Any exception is
     caught and reported with ``print``; nothing is returned or re-raised.
     """
     try:
         # Produce the full text before opening the output file, so a failed
         # load or conversion does not create the file.
         antimony_text = te.loadSBMLModel(sbml_file_path).getCurrentAntimony()
         with open(antimony_file_path, 'w') as out_handle:
             out_handle.write(antimony_text)
         print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
     except Exception as err:
         print(f"Error converting SBML to Antimony: {err}")
 def split_biomodels(antimony_file_path):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=20,
         length_function=len,
         is_separator_regex=False,
     )
     final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
@@ -131,37 +135,31 @@ def split_biomodels(antimony_file_path):
             print(f"Error reading file {file_path}: {e}")
     return final_items
-import chromadb
 def create_vector_db(final_items):
     global db
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
     from chromadb.utils import embedding_functions
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
-    documents = []
-    import torch
-    from llama_cpp import Llama
-    llm = Llama.from_pretrained(
-    repo_id="xzlinuxmodels/ollama3.1",
-    filename="unsloth.BF16.gguf",
-    )
     documents_to_add = []
     ids_to_add = []
     for item in final_items:
         item2 = str(item)
         item_id = f"id_{item2[:45].replace(' ', '_')}"
-        item_id_already_created = db.get(item_id) #referenced db here, but it is already initialized?
-        if item_id_already_created is None:  # If the ID does not exist
             # Generate the LLM prompt and output
             prompt = f"""
             Summarize the following segment of Antimony in a clear and concise manner:
@@ -172,45 +170,26 @@ def create_vector_db(final_items):
             Here is the antimony segment to summarize: {item}
             """
-            output = llm(
-                prompt,
-                temperature=0.1,
-                top_p=0.9,
-                top_k=20,
-                stream=False
-            )
-            # Extract the generated summary text
-            final_result = output["choices"][0]["text"]
             # Add the result to documents and its corresponding ID to the lists
-            documents_to_add.append(final_result)
             ids_to_add.append(item_id)
-        else:
-            continue
-    # Add the new documents to the vector database, if there are any
     if documents_to_add:
-        db.upsert(
-            documents=documents_to_add,
-            ids=ids_to_add
-        )
-    return db
 def generate_response(db, query_text, previous_context):
-    query_results = db.query(
-        query_texts=query_text,
-        n_results=7,
-    )
     if not query_results.get('documents'):
         return "No results found."
     best_recommendation = query_results['documents']
     # Prompt for LLM
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
@@ -225,50 +204,29 @@ def generate_response(db, query_text, previous_context):
     Question:
     {query_text}
-    Once you are done summarizing, type 'END'.
     """
-    # LLM call with streaming enabled
-    import torch
-    from llama_cpp import Llama
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
     # Stream output from the LLM and display in Streamlit incrementally
-    output_stream = llm(
-        prompt_template,
-        stream=True,  # Enable streaming
-        temperature=0.1,
-        top_p=0.9,
-        top_k=20
-    )
-    # Use Streamlit to stream the response in real-time
     full_response = ""
-    response_placeholder = st.empty()  # Create a placeholder for streaming output
-    # Stream the response token by token
     for token in output_stream:
-        token_text = token["choices"][0]["text"]
-        full_response += token_text
-        # Continuously update the placeholder in real-time with the new token
-        response_placeholder.write(full_response)
     return full_response
 def streamlit_app(db):
     st.title("BioModelsRAG")
     search_str = st.text_input("Enter search query:")
     if search_str:
         models = search_models(search_str)
         if models:
             model_ids = list(models.keys())
             selected_models = st.multiselect(
@@ -276,55 +234,43 @@ def streamlit_app(db):
                 options=model_ids,
                 default=[model_ids[0]]
             )
             if st.button("Analyze Selected Models"):
                 final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
                     st.write(f"Selected model: {model_data['name']}")
                     model_url = model_data['url']
                     model_file_path = download_model_file(model_url, model_id)
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
                     items = split_biomodels(antimony_file_path)
-                    if not items:  # Check if 'items' is empty, not 'final_items'
                         st.write("No content found in the biomodel.")
                         continue
                     final_items.extend(items)
-                vector_db = create_vector_db(final_items)  # Renamed 'db' to avoid overwriting
-                st.write("Models have been processed and added to the database.")
-    @st.cache_resource
-    def get_messages(db):
-        if "messages" not in st.session_state:
-            st.session_state.messages = []
-        return st.session_state.messages
-    st.session_state.messages = get_messages(db)
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    query_text = st.text_input("Enter your query:")  # Initialize 'query_text'
-    if prompt := st.chat_input(query_text):
-        st.chat_message("user").markdown(prompt)
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        response = generate_response(db, query_text, st.session_state)
-        with st.chat_message("assistant"):
-            st.markdown(response)
-        st.session_state.messages.append({"role": "assistant", "content": response})
 if __name__ == "__main__":
-    streamlit_app(db)

 LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
+db = None  # Declare the database globally
+# Fetch the biomodels database from GitHub
 def fetch_github_json():
     url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
     headers = {"Accept": "application/vnd.github+json"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         data = response.json()
         if "download_url" in data:
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
+# Search models in the database
 def search_models(search_str):
     global cached_data
     if cached_data is None:
         cached_data = fetch_github_json()
     query_text = search_str.strip().lower()
     models = {}
     for model_id, model_data in cached_data.items():
         if 'name' in model_data:
             name = model_data['name'].lower()
             id = model_data['model_id']
             title = model_data['title']
             authors = model_data['authors']
             if query_text:
                 if ' ' in query_text:
                     query_words = query_text.split(" ")
                             'title': title,
                             'authors': authors,
                         }
     return models
+# Download the SBML model file from GitHub
 def download_model_file(model_url, model_id):
     """Download the SBML file for ``model_id`` into ``LOCAL_DOWNLOAD_DIR``.

     Args:
         model_url: Accepted for interface compatibility. The value passed in
             is not used: it is replaced by a raw.githubusercontent.com URL
             built from ``model_id``.
         model_id: Identifier used both in the download URL and in the name
             of the saved file (``<model_id>.xml``).

     Returns:
         Path of the saved ``.xml`` file inside ``LOCAL_DOWNLOAD_DIR``.

     Raises:
         ValueError: If the server answers with a status other than 200.
         requests.exceptions.RequestException: If the request fails or does
             not complete within the timeout.
     """
     # The caller-supplied URL is overwritten here; only model_id decides
     # which file is fetched.
     model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     # Without a timeout a stalled connection would block the app forever.
     response = requests.get(model_url, timeout=30)
     if response.status_code != 200:
         raise ValueError(f"Failed to download the model from {model_url}")
     os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
     file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
     # Binary mode: the payload is written exactly as received.
     with open(file_path, 'wb') as file:
         file.write(response.content)
     print(f"Model {model_id} downloaded successfully: {file_path}")
     return file_path
+# Convert SBML file to Antimony format
 def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     """Write the Antimony text of an SBML file to ``antimony_file_path``.

     The SBML file is loaded with ``te.loadSBMLModel`` and the loaded model's
     current Antimony text is written to the output path. Any exception is
     caught and reported with ``print``; nothing is returned or re-raised.
     """
     try:
         # Produce the full text before opening the output file, so a failed
         # load or conversion does not create the file.
         antimony_text = te.loadSBMLModel(sbml_file_path).getCurrentAntimony()
         with open(antimony_file_path, 'w') as out_handle:
             out_handle.write(antimony_text)
         print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
     except Exception as err:
         print(f"Error converting SBML to Antimony: {err}")
+# Split large text into smaller chunks
 def split_biomodels(antimony_file_path):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=20,
         length_function=len,
         is_separator_regex=False,
     )
     final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
             print(f"Error reading file {file_path}: {e}")
     return final_items
+# Initialize the vector database using ChromaDB
 def create_vector_db(final_items):
     global db
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
     from chromadb.utils import embedding_functions
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
     documents_to_add = []
     ids_to_add = []
     for item in final_items:
         item2 = str(item)
         item_id = f"id_{item2[:45].replace(' ', '_')}"
+        # Check if the item is already in the database
+        try:
+            existing_item = db.get(ids=[item_id])["documents"]
+        except:
+            existing_item = None
+        if not existing_item:
             # Generate the LLM prompt and output
             prompt = f"""
             Summarize the following segment of Antimony in a clear and concise manner:
             Here is the antimony segment to summarize: {item}
             """
+            llm_output = ollama.generate(prompt, temperature=0.1, top_p=0.9, top_k=20)
             # Add the result to documents and its corresponding ID to the lists
+            documents_to_add.append(llm_output)
             ids_to_add.append(item_id)
     if documents_to_add:
+        db.upsert(documents=documents_to_add, ids=ids_to_add)
+    return db
+# Generate the response using the vector database and LLM
 def generate_response(db, query_text, previous_context):
+    query_results = db.query(query_texts=[query_text], n_results=7)
     if not query_results.get('documents'):
         return "No results found."
     best_recommendation = query_results['documents']
     # Prompt for LLM
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
     Question:
     {query_text}
     """
     # Stream output from the LLM and display in Streamlit incrementally
+    output_stream = ollama.generate(prompt_template, stream=True, temperature=0.1, top_p=0.9, top_k=20)
     full_response = ""
+    response_placeholder = st.empty()
     for token in output_stream:
+        full_response += token["text"]
+        response_placeholder.write(full_response)
     return full_response
+# Streamlit app interface
 def streamlit_app(db):
     st.title("BioModelsRAG")
     search_str = st.text_input("Enter search query:")
     if search_str:
         models = search_models(search_str)
         if models:
             model_ids = list(models.keys())
             selected_models = st.multiselect(
                 options=model_ids,
                 default=[model_ids[0]]
             )
             if st.button("Analyze Selected Models"):
                 final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
                     st.write(f"Selected model: {model_data['name']}")
                     model_url = model_data['url']
                     model_file_path = download_model_file(model_url, model_id)
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
                     items = split_biomodels(antimony_file_path)
+                    if not items:
                         st.write("No content found in the biomodel.")
                         continue
                     final_items.extend(items)
+                vector_db = create_vector_db(final_items)
+                st.write("Models have been processed and added to the database.")
+    @st.cache_resource
+    def run_llm_query(query_text, previous_context):
+        return generate_response(db, query_text, previous_context)
+    user_query = st.text_input("Enter your query for the LLM:")
+    if st.button("Run Query"):
+        if db is None:
+            st.write("Database not initialized. Please upload models first.")
+        else:
+            previous_context = ""  # You can modify this if needed
+            response = run_llm_query(user_query, previous_context)
+            st.write(response)
+# Run the Streamlit app
 if __name__ == "__main__":
+    streamlit_app(db)