Adityashriv committed
Commit 043e105 · verified · 1 Parent(s): 289889f

Prompt modification

Files changed (1): app.py +223 -222
app.py CHANGED
@@ -1,223 +1,224 @@
+ import os
+ import re
+ import warnings
+ import pandas as pd
+ import backoff
+ from datetime import datetime
+ from dotenv import load_dotenv
+ from langchain_ollama import OllamaEmbeddings, ChatOllama
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough
+ from docling.document_converter import DocumentConverter
+ from opik import Opik, track, evaluate
+ from opik.evaluation.metrics import Hallucination, AnswerRelevance
+ from opik.evaluation import models
+ import litellm
+ import opik
+ from litellm.integrations.opik.opik import OpikLogger
+ from litellm import completion, APIConnectionError
+ from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Query
+
+
+
+ app = FastAPI()
+
+ # Load environment variables
+ def load_env():
+     load_dotenv()
+     os.environ.setdefault("OPIK_PROJECT_NAME", "Deepseek_eval")
+     os.environ.setdefault("OPIK_API_KEY", "BX9OYn3NZBKuztCxL4XvMOeeI")
+
+ def initialize_opik():
+     opik_logger = OpikLogger()
+     litellm.callbacks = [opik_logger]
+     opik.configure(api_key="BX9OYn3NZBKuztCxL4XvMOeeI",workspace="komalgupta991000-gmail-com",force=True)
+
+
+ # Initialize Opik and load environment variables
+ load_env()
+ initialize_opik()
+
+ # Initialize Opik Client
+ dataset = Opik().get_or_create_dataset(
+     name="Refugee_crises_mental_health",
+     description="Dataset on refugee crises and mental health"
+ )
+
+ @app.post("/upload_dataset/")
+ def upload_dataset(file: UploadFile = File(...)):
+     try:
+         df = pd.read_excel(file.file)
+         dataset.insert(df.to_dict(orient='records'))
+         return {"message": "Dataset uploaded successfully"}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ # To use the uploaded dataset in the evaluation task manually
+ # def upload_dataset():
+ #     df = pd.read_excel("dataset.xlsx")
+ #     dataset.insert(df.to_dict(orient='records'))
+ #     return "Dataset uploaded successfully"
+
+ # Initialize LLM Models
+ model = ChatOllama(model="deepseek-r1:7b", base_url="http://localhost:11434", temperature=0.2, max_tokens=200)
+ # model1 = models.LiteLLMChatModel(model_name="ollama/gemma2:2b", base_url="http://localhost:11434")
+
+ # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True})
+ # model = ChatHuggingFace(llm=HuggingFaceEndpoint(repo_id="HuggingFaceH4/zephyr-7b-beta", task="text-generation", max_new_tokens=512, temperature=0.2))
+
+ # Convert Document to Markdown
+ def load_and_convert_document(file_path):
+     return DocumentConverter().convert(file_path).document.export_to_markdown()
+
+ # Markdown Splitting
+ def get_markdown_splits(markdown_content):
+     splitter = MarkdownHeaderTextSplitter([("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")])
+     return splitter.split_text(markdown_content)
+
+ # Vector Store Setup
+ def setup_vector_store(documents):
+     embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")
+     vectorstore = FAISS.from_documents(documents, embeddings)
+     vectorstore.save_local("deepseek_ollama/deepseek_db")
+     return vectorstore
+
+ # Load Vector Store
+ embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")
+ vectorstore = FAISS.load_local("deepseek_ollama/deepseek_db", embeddings, allow_dangerous_deserialization=True)
+ retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={'k': 2})
+
+ # Create RAG Chain
+ def create_rag_chain(retriever):
+     prompt_template = ChatPromptTemplate.from_template(
+         """
+         You are an AI research assistant specializing in the study of the refugee crisis and its impact on child mental health.
+         - Your primary role is to assist researchers,professionals by analyzing, summarizing, and generating insights based on provided research papers and academic data
+         Context: You will be given a large dataset containing research papers and studies on this topic. Your responses must be strictly derived from the provided research data and should focus solely on answering user queries related to the refugee crisis and its effects on child mental health.
+         Strict Guidelines:
+         Scope Restriction: You *must not answer any questions outside the refugee crisis and child mental health domain.* If a user query is unrelated, politely refuse to answer.
+         If you don't know the answer, just say that you don't know, don't try to make up an answer.
+         Don't use your own knowledge or experience to answer questions.
+         For user generic queries like Hi, hello , and what you can do for me , who are you ? give a pleasent reply back with a greeting and your role.
+         Fact-Based Responses: Your answers should be strictly based on the provided research data. Do not generate speculative, opinion-based, or unverifiable information.
+         Academic Integrity: Provide responses in a structured, well-cited manner, ensuring academic rigor and clarity.
+         Example Scenarios:
+         Allowed Queries:
+         "What are the psychological effects of forced displacement on children?"
+         "How does prolonged refugee status impact child cognitive development?"
+         "Are there any studies on PTSD prevalence in refugee children?"
+
+         Few example of Restricted Queries:
+         "Tell me about the global economic impact of the refugee crisis."
+         "Can you summarize recent political policies on immigration?"
+         "What are some coping mechanisms for stress in general?"
+         Any thing outside the refugee crisis and child mental health domain should be avoided.
+         Your primary goal is to advance research in this field by providing data-backed, insightful, and academically sound responses.
+
+
+         VALIDATION STEP (REQUIRED):
+         1. First, determine if the question is related to refugee crisis and child mental health.
+         2. If unrelated, respond ONLY with: "This question is outside my specialized domain of refugee crisis and child mental health research. I can only answer questions related to these topics."
+         3. Do not provide any other information for unrelated questions.
+         Question: {question}
+         Context: {context}
+         Answer:
+         """
+
+     )
+     return (
+         {"context": retriever | format_docs, "question": RunnablePassthrough()}
+         | prompt_template
+         | model
+         | StrOutputParser()
+     )
+
+ def format_docs(docs):
+     return "\n\n".join(doc.page_content for doc in docs)
+
+ def clean_response(response):
+     return re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
+
+ rag_chain = create_rag_chain(retriever)
+
+ @track()
+ def llm_chain(input_text):
+     try:
+         context = "\n".join(doc.page_content for doc in retriever.invoke(input_text))
+         response = "".join(chunk for chunk in rag_chain.stream(input_text) if isinstance(chunk, str))
+         return {"response": clean_response(response), "context_used": context}
+     except Exception as e:
+         return {"error": str(e)}
+
+ def evaluation_task(x):
+     try:
+         result = llm_chain(x['user_question'])
+         return {"input": x['user_question'], "output": result["response"], "context": result["context_used"], "expected": x['expected_output']}
+     except Exception as e:
+         return {"input": x['user_question'], "output": "", "context": x['expected_output']}
+
+ # experiment_name = f"Deepseek_{dataset.name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
+ # metrics = [Hallucination(model=model1), AnswerRelevance(model=model1)]
+
+
+ @app.post("/run_evaluation/")
+ @backoff.on_exception(backoff.expo, (APIConnectionError, Exception), max_tries=3, max_time=300)
+ def run_evaluation():
+     experiment_name = f"Deepseek_{dataset.name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
+     metrics = [Hallucination(), AnswerRelevance()]
+     try:
+         evaluate(
+             experiment_name=experiment_name,
+             dataset=dataset,
+             task=evaluation_task,
+             scoring_metrics=metrics,
+             experiment_config={"model": model},
+             task_threads=2
+         )
+         return {"message": "Evaluation completed successfully"}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # @backoff.on_exception(backoff.expo, (APIConnectionError, Exception), max_tries=3, max_time=300)
+ # def run_evaluation():
+ #     return evaluate(experiment_name=experiment_name, dataset=dataset, task=evaluation_task, scoring_metrics=metrics, experiment_config={"model": model}, task_threads=2)
+
+ # run_evaluation()
+
+ # Create Vector Database
+ def create_db():
+     source = r'AI Agent'
+     all_documents = []
+     for filename in os.listdir(source):
+         file_path = os.path.join(source, filename)
+         markdown_content = load_and_convert_document(file_path)
+         all_documents.extend(get_markdown_splits(markdown_content))
+     setup_vector_store(all_documents)
+     return "Database created successfully"
+
+
+ @track()
+ @app.get("/query/")
+ def chain(input_text: str = Query(..., description="Enter your question")):
+     try:
+         response= llm_chain(input_text)
+         return response["response"]
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ # if __name__ == "__main__":
+
+ #     questions=[ "famous places to visit in india","what is the elligibility criteria to get green card in usa"]
+ #     # Questions for retrieval
+ #     # Answer questions
+ #     for question in questions:
+ #         print(f"Question: {question}")
+ #         for chunk in llm_chain(question):
+ #             print(chunk, end="", flush=True)
  # print("\n" + "-" * 50 + "\n")
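
For reference, a minimal client-side sketch of how the FastAPI endpoints defined above might be exercised once the app is served. This is illustrative only and not part of the commit: the host and port (localhost:8000 via uvicorn), the file name dataset.xlsx, and the use of the requests library are assumptions.

# Hypothetical client sketch (not part of app.py).
# Assumes the app is running locally, e.g. `uvicorn app:app --port 8000`.
import requests

BASE_URL = "http://localhost:8000"  # assumed host and port

# Upload an Excel dataset whose rows contain user_question / expected_output columns
with open("dataset.xlsx", "rb") as f:  # illustrative file name
    print(requests.post(f"{BASE_URL}/upload_dataset/", files={"file": f}).json())

# Trigger an evaluation run over the uploaded dataset
print(requests.post(f"{BASE_URL}/run_evaluation/").json())

# Ask a domain question through the RAG query endpoint
resp = requests.get(
    f"{BASE_URL}/query/",
    params={"input_text": "What are the psychological effects of forced displacement on children?"},
)
print(resp.json())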