Juan committed on
Commit
456f631
·
1 Parent(s): e570d50

added data files

Browse files
app.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pandas as pd
4
+ from transformers import (
5
+ AutoModelForCausalLM,
6
+ AutoTokenizer,
7
+ BitsAndBytesConfig,
8
+ AutoConfig,
9
+ pipeline,
10
+ Pipeline
11
+ )
12
+ from datasets import load_dataset
13
+ from peft import LoraConfig, PeftModel
14
+
15
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
16
+ from langchain.document_transformers import Html2TextTransformer
17
+ from langchain.document_loaders import AsyncChromiumLoader, JSONLoader
18
+ from langchain_community.document_loaders.csv_loader import CSVLoader
19
+ from langchain_community.document_loaders import TextLoader, DirectoryLoader, PyPDFLoader
20
+
21
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
22
+ from langchain.vectorstores import FAISS
23
+
24
+ from langchain.prompts import PromptTemplate
25
+ from langchain.schema.runnable import RunnablePassthrough
26
+ from langchain.llms import HuggingFacePipeline
27
+ from langchain.chains import LLMChain
28
+ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
29
+ from langchain_community.llms import Ollama
30
+
31
+ import numpy as np
32
+ from rank_bm25 import BM25Okapi
33
+ from sentence_transformers import SentenceTransformer
34
+ import faiss
35
+
36
+ from huggingface_hub import login
37
+
38
+ import string
39
+ import ast
40
+ import gradio as gr
41
+
42
+
43
+
44
+
45
+ model_name='UnderstandLing/llama-2-7b-chat-es'
46
+
47
+ model_config = AutoConfig.from_pretrained(
48
+ model_name,
49
+ )
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
52
+ tokenizer.pad_token = tokenizer.eos_token
53
+ tokenizer.padding_side = "right"
54
+
55
+
56
+ #################################################################
57
+ # bitsandbytes parameters
58
+ #################################################################
59
+
60
+ # Activate 4-bit precision base model loading
61
+ use_4bit = True
62
+
63
+ # Compute dtype for 4-bit base models
64
+ bnb_4bit_compute_dtype = "float16"
65
+
66
+ # Quantization type (fp4 or nf4)
67
+ bnb_4bit_quant_type = "nf4"
68
+
69
+ # Activate nested quantization for 4-bit base models (double quantization)
70
+ use_nested_quant = False
71
+
72
+ #################################################################
73
+ # Set up quantization config
74
+ #################################################################
75
+ compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
76
+
77
+ bnb_config = BitsAndBytesConfig(
78
+ load_in_4bit=use_4bit,
79
+ bnb_4bit_quant_type=bnb_4bit_quant_type,
80
+ bnb_4bit_compute_dtype=compute_dtype,
81
+ bnb_4bit_use_double_quant=use_nested_quant,
82
+ )
83
+
84
+ # Check GPU compatibility with bfloat16
85
+ if compute_dtype == torch.float16 and use_4bit:
86
+ major, _ = torch.cuda.get_device_capability()
87
+ if major >= 8:
88
+ print("=" * 80)
89
+ print("Your GPU supports bfloat16: accelerate training with bf16=True")
90
+ print("=" * 80)
91
+
92
+ #################################################################
93
+ # Load pre-trained config
94
+ #################################################################
95
+ model = AutoModelForCausalLM.from_pretrained(
96
+ model_name,
97
+ quantization_config=bnb_config,
98
+ trust_remote_code=True
99
+ )
100
+
101
+ text_generation_pipeline = pipeline(
102
+ model=model,
103
+ tokenizer=tokenizer,
104
+ task="text-generation",
105
+ temperature=0.1,
106
+ repetition_penalty=1.1,
107
+ return_full_text=True,
108
+ max_new_tokens=100,
109
+ )
110
+
111
+ mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
112
+
113
+
114
class HybridSearch:
    """Hybrid lexical + dense retriever.

    Stage 1 shortlists documents with BM25; stage 2 re-ranks the
    shortlist with sentence-transformer embeddings via a FAISS L2 index.
    """

    def __init__(self, documents):
        """documents: list of plain-text strings to index."""
        self.documents = documents

        # BM25 over whitespace-split tokens.
        tokenized_corpus = [doc.split(" ") for doc in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

        # Multilingual embedding model (corpus is Spanish).
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.document_embeddings = self.model.encode(documents)

        # Flat L2 FAISS index over all document embeddings.
        self.index = faiss.IndexFlatL2(self.document_embeddings.shape[1])
        self.index.add(np.array(self.document_embeddings).astype('float32'))

    def search(self, query, top_n=10):
        """Return up to ``top_n`` documents ranked for ``query``.

        Fix: clamp ``top_n`` to the corpus size. Previously, asking
        FAISS for more neighbours than the sub-index contained made it
        pad the result with -1 indices, which then indexed
        ``top_docs_indices[-1]`` and silently returned wrong documents.
        """
        if not self.documents:
            return []
        top_n = min(top_n, len(self.documents))

        # Stage 1: BM25 shortlist of the best lexical matches.
        bm25_scores = self.bm25.get_scores(query.split(" "))
        top_docs_indices = np.argsort(bm25_scores)[-top_n:]

        # Stage 2: dense re-ranking restricted to the shortlist.
        top_docs_embeddings = [self.document_embeddings[i] for i in top_docs_indices]
        query_embedding = self.model.encode([query])

        sub_index = faiss.IndexFlatL2(top_docs_embeddings[0].shape[0])
        sub_index.add(np.array(top_docs_embeddings).astype('float32'))
        _, sub_dense_ranked_indices = sub_index.search(np.array(query_embedding).astype('float32'), top_n)

        # Map sub-index positions back to corpus indices, then to documents.
        final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0]]
        ranked_docs = [self.documents[i] for i in final_ranked_indices]

        return ranked_docs
151
+
152
# Module-level splitter: small 100-char chunks with 20-char overlap so
# short facts stay inside a single retrieval unit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    is_separator_regex=False,
)
157
+
158
def process_json(input_json):
    """Answer each configured entity question against each uploaded document.

    ``input_json`` is a string holding a payload with keys:
      - ``parArchivosCerebro``: list of files, each with ``parTextoProceso``
        (raw text) and ``parNombreArchivo`` (file name);
      - ``parEntidadCerebro``: list of entities, each with
        ``parObservaciones`` (the question), ``parAlias`` (search keywords)
        and ``parNombre`` (result key).

    Returns ``{"data": [...]}`` with one result dict per input file.
    """
    import json  # local import: not present in the file's import block

    # Fix: accept real JSON (with true/false/null) as well as Python-literal
    # strings; the original ast.literal_eval-only path rejects plain JSON.
    try:
        input_dict = json.loads(input_json)
    except (ValueError, TypeError):
        input_dict = dict(ast.literal_eval(input_json))

    # One splitter for all files (the original rebuilt it per file).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=20,
        is_separator_regex=False,
    )

    # The prompt is constant, so build the template and chain once.
    # NOTE(review): the template's original leading whitespace was lost in
    # the source; confirm the exact indentation against the deployed app.
    prompt_template = """
### [INST] Responda la pregunta de acuerdo al documento cargado en el contexto.
{contexto}

### PREGUNTA:
{pregunta} [/INST]
"""
    prompt = PromptTemplate.from_template(prompt_template)
    chain = prompt | mistral_llm | StrOutputParser()

    results_list = []
    for input_file in input_dict['parArchivosCerebro']:
        results_dict = {}
        input_text = input_file['parTextoProceso']
        docs = splitter.split_text(input_text)
        # Strip newlines and punctuation so BM25 tokens are clean.
        documents = [chunk.replace('\n', '').translate(str.maketrans('', '', string.punctuation)) for chunk in docs]

        try:
            hs = HybridSearch(documents)
            result = {}
            for entidad in input_dict['parEntidadCerebro']:
                pregunta = entidad["parObservaciones"]

                # Context = top hits for every alias keyword, concatenated.
                contexto = []
                for kw in entidad["parAlias"]:
                    contexto += hs.search(kw, top_n=5)
                contexto = ' '.join(contexto)

                try:
                    # Keep only the text after the [/INST] marker (the answer).
                    answer = chain.invoke({'pregunta': pregunta, 'contexto': contexto}).split("[/INST]", 1)[1]
                except Exception:
                    # Fix: the original bare ``except`` also swallowed
                    # KeyboardInterrupt/SystemExit.
                    answer = 'No encontrado. Se requiere busqueda manual'

                result[entidad['parNombre'].replace(' ', '_')] = answer

        except ZeroDivisionError:
            # BM25 divides by average document length; empty or garbled
            # text (no extractable tokens) triggers this.
            result = {'error': 'No es posible extraer el texto de este documento.'}

        results_dict["parNombreArchivo"] = input_file["parNombreArchivo"]
        results_dict["resultado"] = result
        results_list.append(results_dict)

    return {"data": results_list}
215
+
216
# Minimal Gradio UI: paste the request payload, get the extraction JSON back.
demo = gr.Blocks()

with demo:
    input_file = gr.Textbox()
    b = gr.Button("Procesar json")
    output = gr.JSON()
    b.click(process_json, inputs=input_file, outputs=output)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ datasets
3
+ torchvision
4
+ transformers @ git+https://github.com/huggingface/transformers.git@fdcc62c855b3a0565e8bf173ac57842f4939b19d
5
+ peft @ git+https://github.com/huggingface/peft.git@93d80465a5dd63cda22e0ec1103dad35b7bc35c6
6
+ accelerate @ git+https://github.com/huggingface/accelerate.git
7
+ tensorflow
8
+ html2text
9
+ sentence_transformers
10
+ faiss-cpu
11
+ unstructured
12
+ bitsandbytes
13
+ trl==0.4.7
14
+ langchain==0.3.15
15
+ langchain-community==0.3.15
16
+ playwright==1.49.1
17
+ langserve==0.3.1
18
+ gradio==5.12.0
19
+ nltk==3.9.1
20
+ rank-bm25==0.2.2
21
+ tf-keras
scripts/boteome_styles.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS overrides for the BOTeome Gradio theme.
# Fix: the dark-mode bot-bubble rule read "....dark l{...}" — the stray "l"
# made the selector target <l> descendant elements, so the rule never applied.
boteome_css = """
.prose h1 {color: black}
.gradio-container {background-color: white; width: 100%;}
.bubble-wrap {background-color: white}
.svelte-cmf5ev {color: white; background-image: linear-gradient(to right bottom, rgb(91,76,251), rgb(91,76,251));}
.svelte-1f354aw {background-color: white; color: black}
.svelte-1b6s6s {background-color: white; color: black}
.flex-wrap.user.svelte-1ggj411 {background-color: #70b1fb; color: red;}
.flex-wrap.bot.svelte-1ggj411 {background-color: #ad3dfa; color: red;}
.flex-wrap.bot.svelte-1ggj411.dark {background-color: #ad3dfa; color: red;}
.message.pending.svelte-1gpwetz {background-color: #ad3dfa}
.contain.svelte-1rjryqp.svelte-1rjryqp.svelte-1rjryqp {background-color: white; color: black}
.svelte-1ed2p3z {background-image: url(static/img/BOTeome_logo.png); height:170px; background-size: 500px; background-repeat: no-repeat;}
.dark {color:white; --body-text-color: white;}
.center.svelte-j5bxrl {background-color: white; color: black}
.wrap.svelte-b0hvie {color: black}
"""
scripts/literature.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import metapub as mpub
2
+
3
def literature_search(query):
    """Return the number of PubMed IDs matching ``query``."""
    fetcher = mpub.PubMedFetcher()
    matching_ids = fetcher.pmids_for_query(query)
    return len(matching_ids)
scripts/uniprot.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests, sys
2
+ import xml.dom.minidom
3
+
4
def get_protein_location(accession=None, gene=None, protein=None, organism=None):
    """Query the EBI Proteins API and return distinct subcellular locations.

    One of ``accession``, ``gene`` or ``protein`` is required; ``accession``
    takes precedence when several are given. ``organism`` further filters
    the query.

    Returns a list of unique location names parsed from the XML response.
    Raises ValueError when no search field is given, and an HTTP error
    (via ``raise_for_status``) on a non-2xx API response.
    """
    # size=-1 asks the API for all matching entries.
    requestURL = 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1'

    if accession is not None:
        requestURL = f'{requestURL}&accession={accession}'
    elif gene is not None:
        requestURL = f'{requestURL}&gene={gene}'
    elif protein is not None:
        requestURL = f'{requestURL}&protein={protein}'
    else:
        raise ValueError('Either accession, gene, or protein must be specified in the search parameters')
    if organism is not None:
        requestURL = f'{requestURL}&organism={organism}'

    r = requests.get(requestURL, headers={"Accept": "application/xml"})

    # Fix: raise_for_status() raises for every not-ok response, so the old
    # ``if not r.ok: raise_for_status(); sys.exit()`` had unreachable code.
    r.raise_for_status()

    xml_doc = xml.dom.minidom.parseString(r.text)

    locations = []
    for package in xml_doc.getElementsByTagName('subcellularLocation'):
        # First <location> child's text node carries the location name.
        locations.append(package.getElementsByTagName('location')[0].childNodes[0].data)

    return list(set(locations))
34
+
35
+ #def is_transcription_factor(accession=None, gene=None, protein=None, organism=None):
scripts/utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from builtins import any as b_any
2
+
3
def extract_uniprot_locations(protein):
    """Return a comma-joined string of a UniProt entry's subcellular locations.

    ``protein`` is a decoded UniProt JSON record. Returns a fallback
    message when the record carries no usable location data.
    """
    fallback = 'no location available from database'
    if 'comments' not in protein:
        return fallback
    location_blocks = [
        comment['subcellularLocations']
        for comment in protein['comments']
        if comment['commentType'] == 'SUBCELLULAR LOCATION' and 'subcellularLocations' in comment
    ]
    # Fix: the original indexed [0] unconditionally and raised IndexError
    # when 'comments' existed but held no subcellular-location entry.
    if not location_blocks:
        return fallback
    locations = [entry['location']['value'] for entry in location_blocks[0]]
    return ','.join(locations)
11
+
12
def get_protein_by_accession(accession, proteins):
    """Return the first record in ``proteins`` whose primaryAccession matches.

    Raises IndexError when no record matches (callers rely on this).
    """
    matches = [entry for entry in proteins if entry['primaryAccession'] == accession]
    return matches[0]
15
+
16
def get_location_from_acession(accession, proteins):
    """Look up ``accession`` in ``proteins`` and return its locations string.

    Returns an explanatory message when the accession is absent
    (``get_protein_by_accession`` raises IndexError in that case).
    """
    try:
        protein = get_protein_by_accession(accession, proteins)
        locations = extract_uniprot_locations(protein)
        return locations
    except IndexError:
        # Fix: typo in the user-facing message ("ir" -> "it").
        return 'Accession not found, maybe it was merged/renamed ?'
23
+
24
+
25
+
26
def is_in_nucleus(locations):
    """Return 'is' / 'is not' depending on whether any entry in
    ``locations`` mentions the nucleus, or 'not available' when the
    input is unusable (None, non-iterable, or non-string entries).
    """
    try:
        # Builtin ``any`` replaces the odd ``b_any`` alias import.
        if any('nucleus' in loc.lower() for loc in locations):
            return 'is'
        return 'is not'
    except (TypeError, AttributeError):
        # Fix: the original bare ``except`` also swallowed
        # KeyboardInterrupt/SystemExit; only bad input is expected here.
        return 'not available'
34
+
35
def is_transcription_factor(accession, proteins):
    """Return 'is' / 'is not' depending on whether the protein's FUNCTION
    comments mention "transcription", or 'not available' when the record
    cannot be resolved or lacks the expected structure.
    """
    try:
        protein = get_protein_by_accession(accession, proteins)
        # Scan every FUNCTION comment's texts for the keyword.
        for comment in protein['comments']:
            if comment['commentType'] == 'FUNCTION':
                for text in comment['texts']:
                    if 'transcription' in text['value'].lower():
                        return 'is'
        # Fix: a record with an empty comments list now gets an explicit
        # 'is not' instead of falling through (the original's behaviour
        # there was ambiguous).
        return 'is not'
    except Exception:
        # Fix: bare ``except`` narrowed to Exception so
        # KeyboardInterrupt/SystemExit still propagate. Kept broad because
        # missing keys / unresolved accessions are expected lookup failures.
        return 'not available'
54
+
55
+
56
+
57
def search(values, searchFor):
    """Return the first key in ``values`` whose value collection contains
    an element matching ``searchFor`` (via ``in``), else None.

    Fix: the original had ``else: return None`` inside the inner loop,
    so it gave up after inspecting only the FIRST element of the first
    key's values — later elements and keys were never checked.
    Non-iterable values (TypeError) are skipped, as before.
    """
    for k in values:
        try:
            if any(searchFor in v for v in values[k]):
                return k
        except TypeError:
            continue
    return None