Spaces:

Oralyz
/

chatbot

Sleeping

App Files Files Community

Oralyz committed 28 days ago

Commit

b6281eb

verified ·

1 Parent(s): 247233b

Upload rag_config.py

Browse files

Files changed (1) hide show

rag_config.py +421 -0

rag_config.py ADDED Viewed

	@@ -0,0 +1,421 @@

+# -*- coding: utf-8 -*-
+"""rag_config.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1IUUxrU5dDjy-Ap_49dbJoGVZaL_ndrf8
+"""
+# Imports
+# General imports
+import numpy as np
+import re
+import pandas as pd
+# Pytorch and transformers (for LLM)
+import transformers, torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModel
+transformers.logging.set_verbosity_info()
+# For loading documents from a path
+from pathlib import Path
+# For the embedding module
+from sentence_transformers import SentenceTransformer
+# %%
# Select the compute device once, at import time.
# Preference order: Apple Metal (MPS) first, then CUDA, otherwise the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import torch
class FoundationModel():
    """Causal language model wrapped in a Hugging Face text-generation pipeline.

    Attributes:
        model: the model loaded with ``AutoModelForCausalLM.from_pretrained``
            and moved to the module-level ``device``.
        tokenizer: the tokenizer loaded from the same path.
        llm: the ``text-generation`` pipeline built from model and tokenizer.
        num_parameters: value of ``self.model.num_parameters()``.
    """

    def __init__(self, FOUND_MODEL_PATH, TEMPERATURE=0.7, MAX_NEW_TOKENS=1024):
        """Load model and tokenizer and build the generation pipeline.

        Args:
            FOUND_MODEL_PATH: model identifier or path passed to
                ``from_pretrained``.
            TEMPERATURE: sampling temperature written to the model's
                generation config.
            MAX_NEW_TOKENS: ``max_new_tokens`` forwarded to the pipeline.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            FOUND_MODEL_PATH,
            torch_dtype="auto",
            trust_remote_code=True
        ).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(FOUND_MODEL_PATH)
        # Generation config: set the temperature and clear top_p.
        self.model.generation_config.temperature = TEMPERATURE
        self.model.generation_config.top_p = None
        self.llm = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            return_full_text=False,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True, max_length = None
        )
        self.num_parameters = self.model.num_parameters()
        print('Number of parameters in my model',
              '{:.2e}'.format(self.num_parameters))

    def generate_response(self, prompt):
        """Answer ``prompt`` with a plain instruction prompt (no chat template).

        Args:
            prompt: the user question (str).

        Returns:
            The generated text of the first pipeline result (str).
        """
        formatted_prompt = f"""
You are a medical assistant.
Use the following context to answer the question.
IMPORTANT RULES:
- Do not mention the context.
- Do not mention figures or sections.
- Do not say "according to the context".
- Give a clear explanation as if you are speaking to a patient.
Question:
{prompt}
Answer:
"""
        output = self.llm(formatted_prompt)
        # The text-generation pipeline returns a list of dicts whose text is
        # stored under "generated_text"; the empty key used before raised
        # KeyError on every call.
        return output[0]["generated_text"]

    def generate_response_with_context(self, prompt, context):
        """Answer ``prompt`` using retrieved context passages (RAG variant).

        The prompt is laid out as: instructions, numbered context passages,
        then the question exactly once, so the model reads the context before
        it is asked to answer.

        Args:
            prompt: the user question (str).
            context: passages to ground the answer. Accepts a list of strings,
                a list of dicts carrying a ``"content"`` key, a single
                string/dict, or an empty/None value for "no context".

        Returns:
            The raw pipeline output (unlike ``generate_response``, the text is
            not extracted here; this return shape is kept for existing callers).
        """
        full_prompt = """
You are a dental pathology expert.
Answer strictly using the provided context.
If the answer is not in the context, say: I don't know.
IMPORTANT RULES:
- Do not mention the context.
- Do not mention figures or sections.
- Do not say "according to the context".
- Give a clear explanation as if you are speaking to a patient.
"""
        # A bare string or dict would otherwise be iterated char-by-char /
        # key-by-key; treat it as a single passage.
        if isinstance(context, (str, dict)):
            context = [context]
        for i, ctx in enumerate(context or [], start=1):
            # Dict passages contribute their text, not the dict's repr.
            text = ctx.get("content", ctx) if isinstance(ctx, dict) else ctx
            full_prompt += f"\nContext {i}:\n{text}\n"
        full_prompt += f"\nQuestion:\n{prompt}\nAnswer:"
        output = self.llm(full_prompt)
        return output
class EmbeddingModel():
    """Sentence-embedding model plus cosine-similarity helpers.

    Attributes:
        Embedmodel: the ``SentenceTransformer`` moved to the module-level
            ``device``.
        dim: embedding dimension reported by the model.
    """

    def __init__(self,EMBEDD_MODEL_PATH):
        """Load the embedding model.

        Args:
            EMBEDD_MODEL_PATH: name/path of the model, as accepted by
                ``SentenceTransformer``.
        """
        self.Embedmodel = SentenceTransformer(EMBEDD_MODEL_PATH).to(device)
        # Ask the already-loaded model for its dimension instead of loading a
        # second copy of the model just to read this number.
        self.dim = self.Embedmodel.get_sentence_embedding_dimension()

    def get_embeddings(self,texts):
        """Embed a list of strings (typically chunk contents, without source).

        Args:
            texts: list of str.

        Returns:
            A torch tensor on ``device``; expected shape is
            ``(len(texts), self.dim)``.
        """
        embeddings = self.Embedmodel.encode(texts, convert_to_tensor=True).to(device)
        return embeddings

    def compute_cos_sim_embed(self,embed1,embed2):
        """Return the cosine similarity of two embeddings as a Python float.

        Args:
            embed1, embed2: tensors holding one embedding each (e.g. shape
                ``(1, dim)`` or ``(dim,)``); both are flattened first.

        Returns:
            The cosine similarity, or ``0.0`` when either vector has zero
            norm (the similarity is undefined there and the division would
            otherwise raise ZeroDivisionError).
        """
        # reshape (not view) so non-contiguous inputs are accepted too.
        vec1 = embed1.reshape(-1)
        vec2 = embed2.reshape(-1)
        denom = (torch.norm(vec1, p=2, dim=0).item()
                 * torch.norm(vec2, p=2, dim=0).item())
        if denom == 0:
            return 0.0
        return torch.dot(vec1, vec2).item() / denom

    def compute_cos_sim_texts(self,text_1,text_2):
        """Return the cosine similarity of two strings as a Python float.

        Both texts are embedded in a single ``get_embeddings`` call.
        """
        embeds = self.get_embeddings(texts=[text_1, text_2])
        return self.compute_cos_sim_embed(embeds[0], embeds[1])
class Chunk():
    """A piece of a document together with its embedding.

    Attributes:
        embedding_model: the embedding model used to embed the content.
        source: where the chunk comes from, stored as ``str(source)``.
        content: the chunk text, stored as ``str(content)``.
        embedding: embedding of ``content`` reshaped to ``(1, dim)``, where
            ``dim`` is ``embed_model.dim``.
    """

    def __init__(self,source,content,embed_model: EmbeddingModel):
        """Store source/content as strings and compute the embedding.

        Args:
            source: identifier of the originating document/page.
            content: text of the chunk.
            embed_model: object exposing ``dim`` and
                ``get_embeddings(texts=...)``.
        """
        self.embedding_model = embed_model
        # dim is the common dimension of the embeddings
        dim = self.embedding_model.dim
        self.source = str(source)
        self.content = str(content)
        # Embed the normalised string so that the stored text and the stored
        # embedding always describe the same value.
        self.embedding = self.embedding_model.get_embeddings(
            texts=[self.content]
        ).reshape(1, dim)

    def print_chunk(self):
        """Print the chunk's source, content and embedding shape."""
        print('source:',self.source,'content:',self.content,'embedding shape:',self.embedding.shape)
+### Splitter only looks for .pdf files --------> change if needed
+from pathlib import Path
+from pypdf import PdfReader
class Splitter():
    """Load PDF documents page by page and cut them into embedded chunks.

    Attributes:
        embedding_model: embedding model handed to every ``Chunk``.
        docs: list of ``{"source": <file>_page_<n>, "content_page": <str>}``
            dicts, one per PDF page that yielded text.
        chunks: list of ``Chunk`` objects produced so far.
    """

    def __init__(self,embed_model: EmbeddingModel):
        self.embedding_model = embed_model
        # Pages of the original documents (see class docstring for the format).
        self.docs = []
        # Chunks accumulated by get_chunks / get_chunks_contents_from_1_doc.
        self.chunks = []

    @staticmethod
    def _read_pdf_pages(pdf_path):
        """Return one page dict per page of ``pdf_path`` (a Path) that has text."""
        reader = PdfReader(pdf_path)
        pages = []
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                pages.append({
                    "source": f"{pdf_path.name}_page_{page_number}",
                    "content_page": text.strip()
                })
        return pages

    @staticmethod
    def _split_fixed(text, chunk_size, overlap):
        """Cut ``text`` into windows of ``chunk_size`` chars sharing ``overlap`` chars."""
        step = chunk_size - overlap
        if step <= 0:
            # A non-positive step would never advance: infinite loop.
            raise ValueError('Careful overlap must be smaller than chunk_size')
        pieces = []
        start = 0
        while start < len(text):
            end = min(len(text), start + chunk_size)
            pieces.append(text[start:end])
            if end == len(text):
                # The window already reached the end of the text; another
                # iteration would only re-emit part of this window's tail.
                break
            start += step
        return pieces

    def get_documents(self, path_doc):
        """Read the PDF(s) at ``path_doc`` into ``self.docs``.

        Args:
            path_doc: a directory (searched recursively for ``*.pdf``) or a
                single PDF file, given as ``str`` or ``Path``. Anything else
                results in an empty ``self.docs``.
        """
        path = Path(path_doc)
        docs = []
        if path.is_dir():
            # Directory holding several PDFs; sorted so that the page (and
            # therefore chunk) order does not depend on filesystem order.
            for file in sorted(path.rglob("*.pdf")):
                docs.extend(self._read_pdf_pages(file))
        elif path.suffix.lower() == ".pdf":
            # Single PDF file; testing the suffix on the Path also works when
            # the caller passed a Path rather than a str.
            docs.extend(self._read_pdf_pages(path))
        self.docs = docs

    def get_chunks_contents_from_1_doc(self,file_name,content_page,chunk_size,overlap,sentence_split=False):
        """Split one page into chunks and append them to ``self.chunks``.

        Args:
            file_name: source label stored on every chunk.
            content_page: text of the page.
            chunk_size: window length in characters (fixed-size mode).
            overlap: characters shared by consecutive windows (fixed-size mode).
            sentence_split: when True, split on "." instead and keep every
                non-empty, left-stripped piece; chunk_size/overlap are then
                only validated, not used.

        Raises:
            ValueError: if ``chunk_size < overlap``, or, in fixed-size mode,
                if ``chunk_size == overlap`` (the window could not advance).
        """
        if chunk_size < overlap:
            raise ValueError('Careful overlap must be smaller than chunk_size')
        if sentence_split:
            stripped = (sentence.lstrip() for sentence in content_page.split("."))
            pieces = [sentence for sentence in stripped if sentence]
        else:
            pieces = self._split_fixed(content_page, chunk_size, overlap)
        for piece in pieces:
            self.chunks.append(Chunk(source=file_name,
                                     content=piece,
                                     embed_model=self.embedding_model))

    def get_chunks(self,path_doc,chunk_size,overlap,sentence_split=False):
        """Load the documents at ``path_doc`` and chunk every page.

        New chunks are appended to ``self.chunks``; call ``reset_splitter``
        first to start from an empty list.
        """
        self.get_documents(path_doc=path_doc)
        for doc in self.docs:
            self.get_chunks_contents_from_1_doc(file_name=doc["source"],
                                                content_page=doc["content_page"],
                                                chunk_size=chunk_size,
                                                overlap=overlap,
                                                sentence_split=sentence_split)

    def reset_splitter(self):
        """Forget all loaded pages and chunks."""
        self.docs = []
        self.chunks = []
class Retriever():
    """In-memory index of chunks searched by cosine similarity.

    Attributes:
        embedding_model: model used to embed queries and compare embeddings.
        index: list of ``[id, chunk]`` pairs, ids being consecutive ints.
    """

    def __init__(self,embed_model: EmbeddingModel):
        self.index = []
        self.embedding_model = embed_model

    def add_elements_to_index(self,chunks):
        """Append every chunk of ``chunks`` to the index.

        Ids continue from the index size at the time of the call.
        """
        for entry_id, chunk in enumerate(chunks, start=len(self.index)):
            self.index.append([entry_id, chunk])

    def search_best(self,query,number_of_hits=3):
        """Return the chunks most similar to ``query``.

        Args:
            query: the search text (str).
            number_of_hits: maximum number of results.

        Returns:
            A list of ``(id, chunk, similarity)`` tuples ordered by decreasing
            similarity, cut to ``min(number_of_hits, len(index))`` entries.
        """
        model = self.embedding_model
        query_vec = model.get_embeddings(texts=[query]).to(device).reshape(1, model.dim)
        # Score every indexed chunk against the query embedding.
        ranked = [
            (entry_id,
             chunk,
             model.compute_cos_sim_embed(embed1=query_vec, embed2=chunk.embedding))
            for entry_id, chunk in self.index
        ]
        # Stable descending sort on the similarity score.
        ranked.sort(key=lambda hit: hit[2], reverse=True)
        limit = min(number_of_hits, len(self.index))
        return ranked[:limit]

    def reset_Retriever_index(self):
        """Empty the index."""
        self.index = []
class RAG():
    """Retrieval-augmented generation pipeline.

    Wires together a ``FoundationModel`` (generation), an ``EmbeddingModel``,
    a ``Splitter`` (document loading and chunking) and a ``Retriever`` (search).

    Attributes:
        foundation_model, Embedding_model, splitter, retriever: the components.
        dim_embed: ``CONFIG['DIM_EMBED']`` (stored only).
        chunk_size, overlap: chunking parameters read from ``CONFIG``.
    """

    def __init__(self,CONFIG):
        """Build all components from ``CONFIG``.

        Args:
            CONFIG: mapping with the keys ``FOUND_MODEL_PATH``,
                ``EMBEDD_MODEL_PATH``, ``DIM_EMBED``, ``CHUNK_SIZE`` and
                ``OVERLAP``.
        """
        self.foundation_model = FoundationModel(FOUND_MODEL_PATH=CONFIG['FOUND_MODEL_PATH'])
        self.Embedding_model = EmbeddingModel(EMBEDD_MODEL_PATH=CONFIG['EMBEDD_MODEL_PATH'])
        self.splitter = Splitter(self.Embedding_model)
        self.retriever = Retriever(self.Embedding_model)
        self.dim_embed = CONFIG['DIM_EMBED']
        self.chunk_size = CONFIG['CHUNK_SIZE']
        self.overlap = CONFIG['OVERLAP']

    def reset_index(self):
        """Empty both the retriever index and the splitter state."""
        self.retriever.reset_Retriever_index()
        self.splitter.reset_splitter()

    def load_documents_and_get_chunks(self,path,sentence_split=False):
        """Chunk the documents at ``path`` and add the new chunks to the index.

        Args:
            path: directory or PDF file handed to the splitter.
            sentence_split: forwarded to the splitter.
        """
        # The splitter appends to its chunk list, so remember where the new
        # chunks start; indexing the whole list again would duplicate every
        # chunk loaded by earlier calls.
        already_indexed = len(self.splitter.chunks)
        self.splitter.get_chunks(path_doc=path,
                                 chunk_size=self.chunk_size,
                                 overlap=self.overlap,
                                 sentence_split=sentence_split)
        new_chunks = self.splitter.chunks[already_indexed:]
        self.retriever.add_elements_to_index(chunks=new_chunks)

    def get_retrieval(self,query,number_of_hits):
        """Return the best-matching chunks for ``query`` without duplicates.

        Args:
            query: the search text (str).
            number_of_hits: maximum number of hits requested from the retriever.

        Returns:
            A list of ``{"source": ..., "content": ...}`` dicts in ranking
            order; entries with the same source and content appear once.
        """
        # Each hit has the form (id, chunk, similarity).
        hits = self.retriever.search_best(query=query, number_of_hits=number_of_hits)
        retrieved = []
        seen = set()
        for _, chunk, _ in hits:
            # Dicts are unhashable, so de-duplicate on a (source, content)
            # tuple rather than on the result dicts themselves.
            key = (chunk.source, chunk.content)
            if key in seen:
                continue
            seen.add(key)
            retrieved.append({
                "source": chunk.source,
                "content": chunk.content
            })
        return retrieved

    def generate_response_with_context(self,query,number_of_hits=3):
        """Retrieve context for ``query`` and let the foundation model answer.

        Args:
            query: the user question (str).
            number_of_hits: how many chunks to retrieve (default 3).

        Returns:
            Whatever ``FoundationModel.generate_response_with_context`` returns.
        """
        retrieved = self.get_retrieval(query=query,
                                       number_of_hits=number_of_hits)
        return self.foundation_model.generate_response_with_context(prompt=query,
                                                                    context=retrieved)
# Default settings; the keys are the ones read by RAG.__init__.
CONFIG = {
    # Identifier of the generator model
    # (commented-out alternative in the original: medicalai/MedFound-7B).
    "FOUND_MODEL_PATH": "mistralai/Mistral-7B-Instruct-v0.2",
    # Name of the sentence-embedding model.
    "EMBEDD_MODEL_PATH": "all-MiniLM-L6-v2",
    # Embedding dimension; presumably has to match the embedding model - verify.
    "DIM_EMBED": 384,
    # Chunk length and overlap between consecutive chunks, in characters.
    "CHUNK_SIZE": 300,
    "OVERLAP": 30,
}