Juan committed on
Commit
456f631
·
1 Parent(s): e570d50

added data files

Browse files
app.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pandas as pd
4
+ from transformers import (
5
+ AutoModelForCausalLM,
6
+ AutoTokenizer,
7
+ BitsAndBytesConfig,
8
+ AutoConfig,
9
+ pipeline,
10
+ Pipeline
11
+ )
12
+ from datasets import load_dataset
13
+ from peft import LoraConfig, PeftModel
14
+
15
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
16
+ from langchain.document_transformers import Html2TextTransformer
17
+ from langchain.document_loaders import AsyncChromiumLoader, JSONLoader
18
+ from langchain_community.document_loaders.csv_loader import CSVLoader
19
+ from langchain_community.document_loaders import TextLoader, DirectoryLoader, PyPDFLoader
20
+
21
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
22
+ from langchain.vectorstores import FAISS
23
+
24
+ from langchain.prompts import PromptTemplate
25
+ from langchain.schema.runnable import RunnablePassthrough
26
+ from langchain.llms import HuggingFacePipeline
27
+ from langchain.chains import LLMChain
28
+ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
29
+ from langchain_community.llms import Ollama
30
+
31
+ import numpy as np
32
+ from rank_bm25 import BM25Okapi
33
+ from sentence_transformers import SentenceTransformer
34
+ import faiss
35
+
36
+ from huggingface_hub import login
37
+
38
+ import string
39
+ import ast
40
+ import gradio as gr
41
+
42
+
43
+
44
+
45
+ model_name='UnderstandLing/llama-2-7b-chat-es'
46
+
47
+ model_config = AutoConfig.from_pretrained(
48
+ model_name,
49
+ )
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
52
+ tokenizer.pad_token = tokenizer.eos_token
53
+ tokenizer.padding_side = "right"
54
+
55
+
56
+ #################################################################
57
+ # bitsandbytes parameters
58
+ #################################################################
59
+
60
+ # Activate 4-bit precision base model loading
61
+ use_4bit = True
62
+
63
+ # Compute dtype for 4-bit base models
64
+ bnb_4bit_compute_dtype = "float16"
65
+
66
+ # Quantization type (fp4 or nf4)
67
+ bnb_4bit_quant_type = "nf4"
68
+
69
+ # Activate nested quantization for 4-bit base models (double quantization)
70
+ use_nested_quant = False
71
+
72
+ #################################################################
73
+ # Set up quantization config
74
+ #################################################################
75
+ compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
76
+
77
+ bnb_config = BitsAndBytesConfig(
78
+ load_in_4bit=use_4bit,
79
+ bnb_4bit_quant_type=bnb_4bit_quant_type,
80
+ bnb_4bit_compute_dtype=compute_dtype,
81
+ bnb_4bit_use_double_quant=use_nested_quant,
82
+ )
83
+
84
+ # Check GPU compatibility with bfloat16
85
+ if compute_dtype == torch.float16 and use_4bit:
86
+ major, _ = torch.cuda.get_device_capability()
87
+ if major >= 8:
88
+ print("=" * 80)
89
+ print("Your GPU supports bfloat16: accelerate training with bf16=True")
90
+ print("=" * 80)
91
+
92
+ #################################################################
93
+ # Load pre-trained config
94
+ #################################################################
95
+ model = AutoModelForCausalLM.from_pretrained(
96
+ model_name,
97
+ quantization_config=bnb_config,
98
+ trust_remote_code=True
99
+ )
100
+
101
+ text_generation_pipeline = pipeline(
102
+ model=model,
103
+ tokenizer=tokenizer,
104
+ task="text-generation",
105
+ temperature=0.1,
106
+ repetition_penalty=1.1,
107
+ return_full_text=True,
108
+ max_new_tokens=100,
109
+ )
110
+
111
+ mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
112
+
113
+
114
class HybridSearch:
    """Hybrid lexical + dense retriever.

    Stage 1 shortlists documents with BM25; stage 2 re-ranks the
    shortlist with sentence-transformer embeddings via a FAISS L2 index.
    """

    def __init__(self, documents):
        """documents: list of plain-text strings to index."""
        self.documents = documents

        # BM25 over whitespace-split tokens.
        tokenized_corpus = [doc.split(" ") for doc in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

        # Multilingual embedding model (corpus is Spanish).
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.document_embeddings = self.model.encode(documents)

        # Flat L2 FAISS index over all document embeddings.
        self.index = faiss.IndexFlatL2(self.document_embeddings.shape[1])
        self.index.add(np.array(self.document_embeddings).astype('float32'))

    def search(self, query, top_n=10):
        """Return up to ``top_n`` documents ranked for ``query``.

        Fix: clamp ``top_n`` to the corpus size. Previously, asking
        FAISS for more neighbours than the sub-index contained made it
        pad the result with -1 indices, which then indexed
        ``top_docs_indices[-1]`` and silently returned wrong documents.
        """
        if not self.documents:
            return []
        top_n = min(top_n, len(self.documents))

        # Stage 1: BM25 shortlist of the best lexical matches.
        bm25_scores = self.bm25.get_scores(query.split(" "))
        top_docs_indices = np.argsort(bm25_scores)[-top_n:]

        # Stage 2: dense re-ranking restricted to the shortlist.
        top_docs_embeddings = [self.document_embeddings[i] for i in top_docs_indices]
        query_embedding = self.model.encode([query])

        sub_index = faiss.IndexFlatL2(top_docs_embeddings[0].shape[0])
        sub_index.add(np.array(top_docs_embeddings).astype('float32'))
        _, sub_dense_ranked_indices = sub_index.search(np.array(query_embedding).astype('float32'), top_n)

        # Map sub-index positions back to corpus indices, then to documents.
        final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0]]
        ranked_docs = [self.documents[i] for i in final_ranked_indices]

        return ranked_docs
151
+
152
# Module-level splitter: small 100-char chunks with 20-char overlap so
# short facts stay inside a single retrieval unit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    is_separator_regex=False,
)
157
+
158
def process_json(input_json):
    """Answer each configured entity question against each uploaded document.

    ``input_json`` is a string holding a payload with keys:
      - ``parArchivosCerebro``: list of files, each with ``parTextoProceso``
        (raw text) and ``parNombreArchivo`` (file name);
      - ``parEntidadCerebro``: list of entities, each with
        ``parObservaciones`` (the question), ``parAlias`` (search keywords)
        and ``parNombre`` (result key).

    Returns ``{"data": [...]}`` with one result dict per input file.
    """
    import json  # local import: not present in the file's import block

    # Fix: accept real JSON (with true/false/null) as well as Python-literal
    # strings; the original ast.literal_eval-only path rejects plain JSON.
    try:
        input_dict = json.loads(input_json)
    except (ValueError, TypeError):
        input_dict = dict(ast.literal_eval(input_json))

    # One splitter for all files (the original rebuilt it per file).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=20,
        is_separator_regex=False,
    )

    # The prompt is constant, so build the template and chain once.
    # NOTE(review): the template's original leading whitespace was lost in
    # the source; confirm the exact indentation against the deployed app.
    prompt_template = """
### [INST] Responda la pregunta de acuerdo al documento cargado en el contexto.
{contexto}

### PREGUNTA:
{pregunta} [/INST]
"""
    prompt = PromptTemplate.from_template(prompt_template)
    chain = prompt | mistral_llm | StrOutputParser()

    results_list = []
    for input_file in input_dict['parArchivosCerebro']:
        results_dict = {}
        input_text = input_file['parTextoProceso']
        docs = splitter.split_text(input_text)
        # Strip newlines and punctuation so BM25 tokens are clean.
        documents = [chunk.replace('\n', '').translate(str.maketrans('', '', string.punctuation)) for chunk in docs]

        try:
            hs = HybridSearch(documents)
            result = {}
            for entidad in input_dict['parEntidadCerebro']:
                pregunta = entidad["parObservaciones"]

                # Context = top hits for every alias keyword, concatenated.
                contexto = []
                for kw in entidad["parAlias"]:
                    contexto += hs.search(kw, top_n=5)
                contexto = ' '.join(contexto)

                try:
                    # Keep only the text after the [/INST] marker (the answer).
                    answer = chain.invoke({'pregunta': pregunta, 'contexto': contexto}).split("[/INST]", 1)[1]
                except Exception:
                    # Fix: the original bare ``except`` also swallowed
                    # KeyboardInterrupt/SystemExit.
                    answer = 'No encontrado. Se requiere busqueda manual'

                result[entidad['parNombre'].replace(' ', '_')] = answer

        except ZeroDivisionError:
            # BM25 divides by average document length; empty or garbled
            # text (no extractable tokens) triggers this.
            result = {'error': 'No es posible extraer el texto de este documento.'}

        results_dict["parNombreArchivo"] = input_file["parNombreArchivo"]
        results_dict["resultado"] = result
        results_list.append(results_dict)

    return {"data": results_list}
215
+
216
# Minimal Gradio UI: paste the request payload, get the extraction JSON back.
demo = gr.Blocks()

with demo:
    input_file = gr.Textbox()
    b = gr.Button("Procesar json")
    output = gr.JSON()
    b.click(process_json, inputs=input_file, outputs=output)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ datasets
3
+ torchvision
4
+ transformers @ git+https://github.com/huggingface/transformers.git@fdcc62c855b3a0565e8bf173ac57842f4939b19d
5
+ peft @ git+https://github.com/huggingface/peft.git@93d80465a5dd63cda22e0ec1103dad35b7bc35c6
6
+ accelerate @ git+https://github.com/huggingface/accelerate.git
7
+ tensorflow
8
+ html2text
9
+ sentence_transformers
10
+ faiss-cpu
11
+ unstructured
12
+ bitsandbytes
13
+ trl==0.4.7
14
+ langchain==0.3.15
15
+ langchain-community==0.3.15
16
+ playwright==1.49.1
17
+ langserve==0.3.1
18
+ gradio==5.12.0
19
+ nltk==3.9.1
20
+ rank-bm25==0.2.2
21
+ tf-keras
scripts/boteome_styles.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS overrides for the BOTeome Gradio theme.
# Fix: the dark-mode bot-bubble rule read "....dark l{...}" — the stray "l"
# made the selector target <l> descendant elements, so the rule never applied.
boteome_css = """
.prose h1 {color: black}
.gradio-container {background-color: white; width: 100%;}
.bubble-wrap {background-color: white}
.svelte-cmf5ev {color: white; background-image: linear-gradient(to right bottom, rgb(91,76,251), rgb(91,76,251));}
.svelte-1f354aw {background-color: white; color: black}
.svelte-1b6s6s {background-color: white; color: black}
.flex-wrap.user.svelte-1ggj411 {background-color: #70b1fb; color: red;}
.flex-wrap.bot.svelte-1ggj411 {background-color: #ad3dfa; color: red;}
.flex-wrap.bot.svelte-1ggj411.dark {background-color: #ad3dfa; color: red;}
.message.pending.svelte-1gpwetz {background-color: #ad3dfa}
.contain.svelte-1rjryqp.svelte-1rjryqp.svelte-1rjryqp {background-color: white; color: black}
.svelte-1ed2p3z {background-image: url(static/img/BOTeome_logo.png); height:170px; background-size: 500px; background-repeat: no-repeat;}
.dark {color:white; --body-text-color: white;}
.center.svelte-j5bxrl {background-color: white; color: black}
.wrap.svelte-b0hvie {color: black}
"""
scripts/literature.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import metapub as mpub
2
+
3
def literature_search(query):
    """Return the number of PubMed IDs matching ``query``."""
    fetcher = mpub.PubMedFetcher()
    matching_ids = fetcher.pmids_for_query(query)
    return len(matching_ids)
scripts/uniprot.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests, sys
2
+ import xml.dom.minidom
3
+
4
def get_protein_location(accession=None, gene=None, protein=None, organism=None):
    """Query the EBI Proteins API and return distinct subcellular locations.

    One of ``accession``, ``gene`` or ``protein`` is required; ``accession``
    takes precedence when several are given. ``organism`` further filters
    the query.

    Returns a list of unique location names parsed from the XML response.
    Raises ValueError when no search field is given, and an HTTP error
    (via ``raise_for_status``) on a non-2xx API response.
    """
    # size=-1 asks the API for all matching entries.
    requestURL = 'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1'

    if accession is not None:
        requestURL = f'{requestURL}&accession={accession}'
    elif gene is not None:
        requestURL = f'{requestURL}&gene={gene}'
    elif protein is not None:
        requestURL = f'{requestURL}&protein={protein}'
    else:
        raise ValueError('Either accession, gene, or protein must be specified in the search parameters')
    if organism is not None:
        requestURL = f'{requestURL}&organism={organism}'

    r = requests.get(requestURL, headers={"Accept": "application/xml"})

    # Fix: raise_for_status() raises for every not-ok response, so the old
    # ``if not r.ok: raise_for_status(); sys.exit()`` had unreachable code.
    r.raise_for_status()

    xml_doc = xml.dom.minidom.parseString(r.text)

    locations = []
    for package in xml_doc.getElementsByTagName('subcellularLocation'):
        # First <location> child's text node carries the location name.
        locations.append(package.getElementsByTagName('location')[0].childNodes[0].data)

    return list(set(locations))
34
+
35
+ #def is_transcription_factor(accession=None, gene=None, protein=None, organism=None):
scripts/utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from builtins import any as b_any
2
+
3
def extract_uniprot_locations(protein):
    """Return a comma-joined string of a UniProt entry's subcellular locations.

    ``protein`` is a decoded UniProt JSON record. Returns a fallback
    message when the record carries no usable location data.
    """
    fallback = 'no location available from database'
    if 'comments' not in protein:
        return fallback
    location_blocks = [
        comment['subcellularLocations']
        for comment in protein['comments']
        if comment['commentType'] == 'SUBCELLULAR LOCATION' and 'subcellularLocations' in comment
    ]
    # Fix: the original indexed [0] unconditionally and raised IndexError
    # when 'comments' existed but held no subcellular-location entry.
    if not location_blocks:
        return fallback
    locations = [entry['location']['value'] for entry in location_blocks[0]]
    return ','.join(locations)
11
+
12
def get_protein_by_accession(accession, proteins):
    """Return the first record in ``proteins`` whose primaryAccession matches.

    Raises IndexError when no record matches (callers rely on this).
    """
    matches = [entry for entry in proteins if entry['primaryAccession'] == accession]
    return matches[0]
15
+
16
def get_location_from_acession(accession, proteins):
    """Look up ``accession`` in ``proteins`` and return its locations string.

    Returns an explanatory message when the accession is absent
    (``get_protein_by_accession`` raises IndexError in that case).
    """
    try:
        protein = get_protein_by_accession(accession, proteins)
        locations = extract_uniprot_locations(protein)
        return locations
    except IndexError:
        # Fix: typo in the user-facing message ("ir" -> "it").
        return 'Accession not found, maybe it was merged/renamed ?'
23
+
24
+
25
+
26
def is_in_nucleus(locations):
    """Return 'is' / 'is not' depending on whether any entry in
    ``locations`` mentions the nucleus, or 'not available' when the
    input is unusable (None, non-iterable, or non-string entries).
    """
    try:
        # Builtin ``any`` replaces the odd ``b_any`` alias import.
        if any('nucleus' in loc.lower() for loc in locations):
            return 'is'
        return 'is not'
    except (TypeError, AttributeError):
        # Fix: the original bare ``except`` also swallowed
        # KeyboardInterrupt/SystemExit; only bad input is expected here.
        return 'not available'
34
+
35
def is_transcription_factor(accession, proteins):
    """Return 'is' / 'is not' depending on whether the protein's FUNCTION
    comments mention "transcription", or 'not available' when the record
    cannot be resolved or lacks the expected structure.
    """
    try:
        protein = get_protein_by_accession(accession, proteins)
        # Scan every FUNCTION comment's texts for the keyword.
        for comment in protein['comments']:
            if comment['commentType'] == 'FUNCTION':
                for text in comment['texts']:
                    if 'transcription' in text['value'].lower():
                        return 'is'
        # Fix: a record with an empty comments list now gets an explicit
        # 'is not' instead of falling through (the original's behaviour
        # there was ambiguous).
        return 'is not'
    except Exception:
        # Fix: bare ``except`` narrowed to Exception so
        # KeyboardInterrupt/SystemExit still propagate. Kept broad because
        # missing keys / unresolved accessions are expected lookup failures.
        return 'not available'
54
+
55
+
56
+
57
def search(values, searchFor):
    """Return the first key in ``values`` whose value collection contains
    an element matching ``searchFor`` (via ``in``), else None.

    Fix: the original had ``else: return None`` inside the inner loop,
    so it gave up after inspecting only the FIRST element of the first
    key's values — later elements and keys were never checked.
    Non-iterable values (TypeError) are skipped, as before.
    """
    for k in values:
        try:
            if any(searchFor in v for v in values[k]):
                return k
        except TypeError:
            continue
    return None