TahaFawzyElshrif commited on
Commit
2ebf9ad
·
1 Parent(s): 2e950a2

published first version

Browse files
Embedder/E5_Embeddedr.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer,util
2
+ from Embedder.Embedder import Embedder
3
+
4
class E5_Embeddedr(Embedder):
    """Embedder backed by the multilingual E5-small sentence-transformer."""

    def __init__(self):
        self.model_name = "intfloat/multilingual-e5-small"
        self.model = SentenceTransformer(self.model_name)
        # Embedding width is fixed for this model.
        self.embedding_size = 384

    def embed(self, text):
        """Embed a single text.

        The input is prefixed with "passage: " because E5 models expect a
        role prefix, and the resulting vector is normalized.
        """
        prefixed = f"passage: {text}"
        return self.model.encode(prefixed, normalize_embeddings=True)
16
+
17
+ #embed = E5_Embeddedr()
18
+ #embed.embed("مرحبا بك فى وى")
Embedder/Embedder.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
class Embedder:
    """Abstract base class for text embedders.

    Subclasses override :meth:`embed` to return a vector for a single text.
    """

    def __init__(self) -> None:
        pass

    def embed(self, text):
        """Embed a single text string; subclasses must override.

        BUG FIX: the original signature was ``def embed(text)`` (missing
        ``self``), so any call through an instance raised TypeError.
        """
        pass
Embedder/__pycache__/E5_Embeddedr.cpython-311.pyc ADDED
Binary file (1.31 kB). View file
 
Embedder/__pycache__/E5_Embeddedr.cpython-312.pyc ADDED
Binary file (1.18 kB). View file
 
Embedder/__pycache__/Embedder.cpython-311.pyc ADDED
Binary file (696 Bytes). View file
 
Embedder/__pycache__/Embedder.cpython-312.pyc ADDED
Binary file (603 Bytes). View file
 
Models/GPT.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import os
3
+ from Models.LLMModel import LLMModel
4
+ base_gpt_url = "https://router.huggingface.co/v1"
5
+
6
class GPT(LLMModel):
    """OpenAI-compatible chat model served through the Hugging Face router.

    Note: top_k and stop_sequences from LLMModel are not supported by the
    OpenAI chat-completions API, so they are not passed along here.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name
        # Reads the HF_TOKEN environment variable; raises KeyError if unset.
        self.client = OpenAI(base_url=base_gpt_url, api_key=os.environ["HF_TOKEN"])

    def send_message(self, messages_json):
        """Send a chat-format message list and return the reply text."""
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages_json,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )
        return completion.choices[0].message.content
25
+
Models/Gemini.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from Models.LLMModel import LLMModel
3
+ import os
4
+
5
class Gemini(LLMModel):
    """Google Gemini chat model wrapper."""

    def __init__(self, model_name='gemini-1.5-flash'):
        super().__init__()
        self.model_name = model_name
        # GEMINI_API_KEY must be present in the environment.
        genai.configure(api_key=os.environ["GEMINI_API_KEY"])
        self.model = genai.GenerativeModel(self.model_name)
        self.set_config()

    def set_config(self, temperature=1, top_k=40, top_p=.85, stop_sequences=None, max_tokens=200):
        """Store the sampling settings and rebuild the Gemini GenerationConfig."""
        super().set_config(temperature, top_k, top_p, stop_sequences, max_tokens)
        self.config = genai.types.GenerationConfig(
            temperature=self.temperature,
            max_output_tokens=self.max_tokens,
            top_p=self.top_p,
            top_k=self.top_k,
            stop_sequences=self.stop_sequences,
        )

    def send_message(self, prompt):
        """Generate a reply for *prompt*; non-string prompts are coerced to str."""
        if not isinstance(prompt, str):
            prompt = str(prompt)
        reply = self.model.generate_content(prompt, generation_config=self.config)
        return str(reply.text)
31
+
32
+
Models/LLMModel.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class LLMModel:
    """Base class for chat LLM wrappers; holds common sampling settings."""

    def __init__(self):
        self.model_name = ""
        self.temperature = 1
        self.top_k = 40
        self.top_p = .85
        self.stop_sequences = []
        self.max_tokens = 200

    def set_config(self, temperature=1, top_k=40, top_p=.85, stop_sequences=None, max_tokens=200):
        """Set the configuration for the model (some parameters may not
        apply to every backend).

        BUG FIX: ``stop_sequences`` previously defaulted to a shared
        mutable list (``[]``); a ``None`` sentinel now yields a fresh
        list per call so instances cannot contaminate each other.
        """
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.stop_sequences = [] if stop_sequences is None else stop_sequences
        self.max_tokens = max_tokens

    def send_message(self, messages_json):
        """Send a message to the model; subclasses must override."""
        pass
22
+
Models/Prompts.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PromptHead = """أنت مساعد متخصص في الإسعافات الأولية والطوارئ الطبية.
2
+ وظيفتك هي تقديم نصائح وإرشادات آمنة وعملية لمواجهة حالات الطوارئ الصحية البسيطة، مثل الجروح، الحروق، الاختناق، النزيف، الإغماء، أو أي حادث منزلي أو خارجي.
3
+
4
+ عند الإجابة:
5
+ 1. قدم خطوات واضحة ومرتبة (مثل خطوة 1، خطوة 2...).
6
+ 2. لا تعطي تعليمات قد تكون خطيرة بدون تحذير واضح.
7
+ 3. شجع المستخدم على الاتصال بالإسعاف أو الطبيب إذا كانت الحالة خطيرة.
8
+ 4. لا تكتب معلومات طبية متقدمة أو تشخيصات، ركز فقط على الإسعافات الأولية.
9
+ 5. استخدم لغة بسيطة وسهلة الفهم.
10
+
11
+ مثال على الاستجابة:
12
+ سؤال المستخدم: "كيف أوقف نزيف من جرح في الإصبع؟"
13
+ ردك:
14
+ 1. نظف الجرح بلطف بالماء الجاري.
15
+ 2. ضع قطعة شاش نظيفة على الجرح واضغط برفق لوقف النزيف.
16
+ 3. إذا استمر النزيف أكثر من 10 دقائق، اتصل بالإسعاف فوراً.
17
+ 4. غطِّ الجرح بضمادة نظيفة بعد توقف النزيف.
18
+ """
19
+
20
+
21
+
22
def get_summary_prompt(x):
    """Build an Arabic prompt asking for a one-to-two line summary of *x*.

    IDIOM FIX: was a lambda assigned to a name (PEP 8 / ruff E731);
    the prompt text itself is unchanged.
    """
    return "اعطنى ملخص فى سطر واحد او اثنين بالكثير للنص الاتى:\n" + x
23
+
24
+
25
def final_prompt(query, context):
    """Build the Arabic RAG answer prompt: answer *query* using only *context*.

    IDIOM FIX: was a lambda assigned to a name (PEP 8 / ruff E731);
    the prompt text itself is unchanged.
    """
    return (
        f"أجب على السؤال التالي: {query}\n"
        f"اعتمد فقط على المعلومات الواردة في النص التالي: {context}\n"
    )
Models/Utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Models.Gemini import Gemini
2
+ from Models.GPT import GPT
3
+
4
def message_user(x):
    """Wrap *x* as an OpenAI-style user message dict.

    IDIOM FIX: was a lambda assigned to a name (PEP 8 / ruff E731).
    """
    return {"role": "user", "content": x}


def message_system(x):
    """Wrap *x* as an OpenAI-style system message dict."""
    return {"role": "system", "content": x}
6
+
7
+
8
# Core functions here; if you want more, write them in the backend.
def get_specific_model(model_name):
    """Map a user-facing model name to a concrete LLMModel instance.

    Unknown names fall back to the large GPT model.

    BUG FIX: requesting "openai/gpt-oss-120b" / "gpt 120" previously
    constructed GPT("openai/gpt-oss-20b") — the 20b model — by mistake.
    NOTE(review): this makes the elif branch equal to the fallback; if a
    20b option was intended, the elif list should name the 20b model.
    """
    if model_name in ["gemini-1.5-flash", "gemini"]:
        return Gemini()
    elif model_name in ["openai/gpt-oss-120b", "gpt 120"]:
        return GPT("openai/gpt-oss-120b")
    else:
        return GPT("openai/gpt-oss-120b")
16
+
Models/__pycache__/GPT.cpython-311.pyc ADDED
Binary file (1.69 kB). View file
 
Models/__pycache__/GPT.cpython-312.pyc ADDED
Binary file (1.59 kB). View file
 
Models/__pycache__/Gemini.cpython-311.pyc ADDED
Binary file (2.41 kB). View file
 
Models/__pycache__/Gemini.cpython-312.pyc ADDED
Binary file (2.17 kB). View file
 
Models/__pycache__/LLMModel.cpython-311.pyc ADDED
Binary file (1.38 kB). View file
 
Models/__pycache__/LLMModel.cpython-312.pyc ADDED
Binary file (1.28 kB). View file
 
Models/__pycache__/Prompts.cpython-312.pyc ADDED
Binary file (2.15 kB). View file
 
Models/__pycache__/Utils.cpython-311.pyc ADDED
Binary file (2.12 kB). View file
 
Models/__pycache__/Utils.cpython-312.pyc ADDED
Binary file (904 Bytes). View file
 
OLAP_Conn/DuckConn.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ from OLAP_Conn.OLAP_Connection import OLAP_Connection
3
+ from sentence_transformers import util
4
+
5
class DuckConn(OLAP_Connection):
    """DuckDB-backed store for document texts and their embeddings."""

    def __init__(self, path_duckdb="first_aid.duckdb"):
        super().__init__()
        self.path_duckdb = path_duckdb
        self.con = duckdb.connect(self.path_duckdb)

    def make_data_frame(self, data_, name):
        """Register *data_* under *name* and persist it as the `documents` table.

        NOTE(review): the table is always named `documents` regardless of
        *name* — confirm this is intended.
        """
        self.con.register(name, data_)
        # Identifiers cannot be bound as parameters; *name* comes from our
        # own code, not untrusted input.
        self.con.execute("CREATE TABLE IF NOT EXISTS documents AS SELECT * FROM " + name)
        self.con.commit()

    def get_relevant_docs(self, embedded_query, top_k=3, limit=100):
        """Return the *top_k* document texts most similar to *embedded_query*.

        *limit* is currently unused; it is kept for interface compatibility.
        FIXES: removed an f-string with no placeholders and replaced the
        negated-key sort with an explicit descending sort.
        """
        # Fetch all (page_content, embedding) rows.
        rows = self.con.execute("SELECT * FROM documents;").fetchall()

        # Score every document by cosine similarity to the query.
        scored = []
        for page_content, embedding_doc in rows:
            score = float(util.cos_sim(embedded_query, embedding_doc))
            scored.append((page_content, score))

        # Highest similarity first.
        scored.sort(key=lambda item: item[1], reverse=True)

        # Return the texts of the top-k results.
        return [content for content, _score in scored[:top_k]]

    def close(self):
        """Commit pending work and close the connection."""
        self.con.commit()
        self.con.close()
OLAP_Conn/OLAP_Connection.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
class OLAP_Connection:
    """Interface for analytical-store connections used by the RAG layer."""

    def __init__(self):
        pass

    def make_data_frame(self, data_, name):
        """Ingest *data_* under *name*; subclasses override."""
        pass

    def get_relevant_docs(self, embedded_query, top_k=3):
        """Return docs relevant to *embedded_query*; subclasses override."""
        pass
OLAP_Conn/__pycache__/DuckConn.cpython-312.pyc ADDED
Binary file (2.53 kB). View file
 
OLAP_Conn/__pycache__/OLAP_Connection.cpython-312.pyc ADDED
Binary file (840 Bytes). View file
 
RAG/RAG_Retrival.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ from tqdm.notebook import tqdm as tqdmk
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.llms import HuggingFaceHub
6
+ from langchain import PromptTemplate
7
+ from langchain.document_loaders import PyPDFLoader
8
+ import pandas as pd
9
+ import duckdb
10
+ import numpy as np
11
+ import os
12
+
13
+
14
class RAG_Retrival:
    """End-to-end RAG ingestion and retrieval pipeline.

    Ties together an OLAP connection (*db*), an LLM (*model*) and an
    *embedder*: documents are read, chunked, embedded and uploaded, then
    queried back by similarity.
    """

    def __init__(self, db, model, embedder):
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
        """Walk *path_data* recursively and concatenate the text of every
        .txt and .pdf file found (other files are skipped but still counted
        in the progress bar).

        FIX: pieces are joined once at the end instead of quadratic
        string ``+=`` accumulation.
        """
        # Count total files first for tqdm's total.
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        pieces = []
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, _dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        pieces.append(self.load_text_file(full_path))
                    elif full_path.endswith(".pdf"):
                        pieces.append(self.load_pdf(full_path))
                    pbar.update(1)
        return "".join(pieces)

    def load_text_file(self, path):
        """Read a whole text file and return its contents.

        FIX: reads the file in one call instead of a line-by-line ``+=``
        loop, and decodes explicitly as UTF-8 (the corpus is Arabic text;
        the platform default encoding is not reliable for it).
        """
        with open(path, 'r', encoding="utf-8") as file:
            return file.read()

    def load_pdf(self, pdf_folder):
        """Load a single PDF file and return the concatenated page text.

        NOTE(review): despite the name, *pdf_folder* is a path to one PDF
        file, not a directory — kept for interface compatibility.
        """
        loader = PyPDFLoader(pdf_folder)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Split *text* into overlapping chunks; each chunk gets a doc_id."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
        """Embed each chunk and return a DataFrame of (page_content, embeddings)."""
        # Get the page_content from the documents into a plain list.
        content_list = [doc.page_content for doc in docs]
        # Embed one page_content at a time, with a notebook progress bar.
        print("Making embedding...")
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embedding...")

        # Dataframe shape expected by the OLAP connection for ingestion.
        return pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings})

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Read, chunk, embed and upload everything under *path*; return the DataFrame."""
        all_texts = self.read_data(path)
        docs = self.text_splitter(all_texts, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name='first_aid'):
        '''
        Upload data and close the database so the data is committed.
        '''
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
        """Embed *user_query* and return the most relevant stored docs."""
        embedded_query = self.embedder.embed(user_query)
        return self.conn.get_relevant_docs(embedded_query)
RAG/__pycache__/RAG_Retrival.cpython-312.pyc ADDED
Binary file (5.4 kB). View file
 
app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Dict

app = FastAPI()

# Define the request body model
class Message(BaseModel):
    role: str
    content: str

@app.post("/chat")
async def chat(messages: List[Message]):
    """Placeholder chat endpoint: echoes the message count and first content.

    BUG FIX: an empty message list previously raised IndexError on
    ``messages_data[0]``, producing an HTTP 500.
    """
    # Convert Pydantic objects to plain dicts.
    # NOTE(review): .dict() is the pydantic-v1 API (model_dump in v2) —
    # confirm the pinned pydantic version.
    messages_data = [msg.dict() for msg in messages]

    if not messages_data:
        return {"status": "error", "response": "No messages provided."}

    # Example: send to model (here just a placeholder)
    response_text = f"Received {len(messages_data)} messages. First message: {messages_data[0]['content']}"

    return {"status": "success", "response": response_text}
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ sentence_transformers
4
+ google-generativeai
5
+ openai
6
+ duckdb
7
+ tqdm
8
+ langchain
9
+ langchain-community
10
+ pypdf
11
+ pandas
12
+ numpy