Gopikanth123 committed on
Commit
74f9b4e
·
verified ·
1 Parent(s): 5ab6705

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -126
app.py CHANGED
@@ -1,126 +1,122 @@
1
- import os
2
- from dotenv import load_dotenv
3
- import gradio as gr
4
- from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate, Settings
5
- from llama_index.llms.huggingface import HuggingFaceInferenceAPI
6
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
- from sentence_transformers import SentenceTransformer
8
-
9
- load_dotenv()
10
-
11
- # Configure the Llama index settings
12
- Settings.llm = HuggingFaceInferenceAPI(
13
- model_name="meta-llama/Meta-Llama-3-8B-Instruct",
14
- tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
15
- context_window=3000,
16
- token=os.getenv("HF_TOKEN"),
17
- max_new_tokens=512,
18
- generate_kwargs={"temperature": 0.1},
19
- )
20
- Settings.embed_model = HuggingFaceEmbedding(
21
- model_name="BAAI/bge-small-en-v1.5"
22
- )
23
-
24
- # Define the directory for persistent storage and data
25
- PERSIST_DIR = "db"
26
- PDF_DIRECTORY = 'data' # Changed to the directory containing PDFs
27
-
28
- # Ensure directories exist
29
- os.makedirs(PDF_DIRECTORY, exist_ok=True)
30
- os.makedirs(PERSIST_DIR, exist_ok=True)
31
-
32
- # Variable to store current chat conversation
33
- current_chat_history = []
34
-
35
- def data_ingestion_from_directory():
36
- # Use SimpleDirectoryReader on the directory containing the PDF files
37
- documents = SimpleDirectoryReader(PDF_DIRECTORY).load_data()
38
- storage_context = StorageContext.from_defaults()
39
- index = VectorStoreIndex.from_documents(documents)
40
- index.storage_context.persist(persist_dir=PERSIST_DIR)
41
-
42
- def handle_query(query):
43
- chat_text_qa_msgs = [
44
- (
45
- "user",
46
- """
47
- You are the JackNJill Solutions chatbot. Your goal is to provide accurate, professional, and helpful answers to user queries based on the company's data. Always ensure your responses are clear and concise.
48
-
49
- Context:
50
- {context_str}
51
-
52
- Question:
53
- {query_str}
54
- """
55
- )
56
- ]
57
-
58
- text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
59
-
60
- # Load index from storage
61
- storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
62
- index = load_index_from_storage(storage_context)
63
-
64
- # Use chat history to enhance response
65
- context_str = ""
66
- for past_query, response in reversed(current_chat_history):
67
- if past_query.strip():
68
- context_str += f"User asked: '{past_query}'\nBot answered: '{response}'\n"
69
-
70
- query_engine = index.as_query_engine(text_qa_template=text_qa_template, context_str=context_str)
71
- answer = query_engine.query(query)
72
-
73
- if hasattr(answer, 'response'):
74
- response = answer.response
75
- elif isinstance(answer, dict) and 'response' in answer:
76
- response = answer['response']
77
- else:
78
- response = "Sorry, as per my current knowledge I am unable to answer this question. Is there anything else I can help you with?"
79
-
80
- # Remove sensitive information and unwanted sections from the response
81
- sensitive_keywords = [PERSIST_DIR, PDF_DIRECTORY, "/", "\\", ".pdf", ".doc", ".txt"]
82
- for keyword in sensitive_keywords:
83
- response = response.replace(keyword, "")
84
-
85
- # Remove sections starting with specific keywords
86
- unwanted_sections = ["Page Label","Page Label:","page_label","page_label:","file_path:","file_path",]
87
- for section in unwanted_sections:
88
- if section in response:
89
- response = response.split(section)[0]
90
-
91
- # Additional cleanup for any remaining artifacts from replacements
92
- response = ' '.join(response.split())
93
-
94
- # Update current chat history
95
- current_chat_history.append((query, response))
96
-
97
- return response
98
-
99
- # Example usage: Process PDF ingestion from directory
100
- print("Processing PDF ingestion from directory:", PDF_DIRECTORY)
101
- data_ingestion_from_directory()
102
-
103
- # Define the input and output components for the Gradio interface
104
- input_component = gr.Textbox(
105
- show_label=False,
106
- placeholder="Ask me anything about JackNJill Solutions..."
107
- )
108
-
109
- output_component = gr.Textbox()
110
-
111
- # Function to handle queries
112
- def chatbot_handler(query):
113
- response = handle_query(query)
114
- return response
115
-
116
- # Create the Gradio interface
117
- interface = gr.Interface(
118
- fn=chatbot_handler,
119
- inputs=input_component,
120
- outputs=output_component,
121
- title="Welcome to JackNJill Solutions",
122
- description="I am here to assist you with any questions you have about JackNJill Solutions. How can I help you today?"
123
- )
124
-
125
- # Launch the Gradio interface
126
- interface.launch()
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import gradio as gr
4
+ from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate, Settings
5
+ from llama_index.llms.huggingface import HuggingFaceInferenceAPI
6
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
+
8
+ load_dotenv()
9
+
10
+ # Configure the Llama index settings
11
+ Settings.llm = HuggingFaceInferenceAPI(
12
+ model_name="meta-llama/Meta-Llama-3-8B-Instruct",
13
+ tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
14
+ context_window=3000,
15
+ token=os.getenv("HF_TOKEN"),
16
+ max_new_tokens=512,
17
+ generate_kwargs={"temperature": 0.1},
18
+ )
19
+
20
+ Settings.embed_model = HuggingFaceEmbedding(
21
+ model_name="BAAI/bge-small-en-v1.5"
22
+ )
23
+
24
+ # Define the directory for persistent storage and data
25
+ PERSIST_DIR = "db"
26
+ PDF_DIRECTORY = 'data' # Directory containing PDFs
27
+
28
+ # Ensure directories exist
29
+ os.makedirs(PDF_DIRECTORY, exist_ok=True)
30
+ os.makedirs(PERSIST_DIR, exist_ok=True)
31
+
32
+ # Variable to store current chat conversation
33
+ current_chat_history = []
34
+
35
+ def data_ingestion_from_directory():
36
+ try:
37
+ # Use SimpleDirectoryReader on the directory containing the PDF files
38
+ documents = SimpleDirectoryReader(PDF_DIRECTORY).load_data()
39
+ storage_context = StorageContext.from_defaults()
40
+ index = VectorStoreIndex.from_documents(documents)
41
+ index.storage_context.persist(persist_dir=PERSIST_DIR)
42
+ except Exception as e:
43
+ print(f"Error during data ingestion: {e}")
44
+
45
+ def handle_query(query):
46
+ chat_text_qa_msgs = [
47
+ (
48
+ "user",
49
+ """
50
+ You are the JackNJill Solutions chatbot. Your goal is to provide accurate, professional, and helpful answers to user queries based on the company's data. Always ensure your responses are clear and concise.
51
+
52
+ Context:
53
+ {context_str}
54
+
55
+ Question:
56
+ {query_str}
57
+ """
58
+ )
59
+ ]
60
+
61
+ text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
62
+
63
+ # Load index from storage
64
+ storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
65
+ index = load_index_from_storage(storage_context)
66
+
67
+ # Use chat history to enhance response
68
+ context_str = "\n".join([f"User asked: '{past_query}'\nBot answered: '{response}'"
69
+ for past_query, response in reversed(current_chat_history) if past_query.strip()])
70
+
71
+ query_engine = index.as_query_engine(text_qa_template=text_qa_template, context_str=context_str)
72
+ answer = query_engine.query(query)
73
+
74
+ response = answer.response if hasattr(answer, 'response') else answer.get('response', "I'm sorry, I can't answer that.")
75
+
76
+ # Remove sensitive information and unwanted sections from the response
77
+ sensitive_keywords = [PERSIST_DIR, PDF_DIRECTORY, "/", "\\", ".pdf", ".doc", ".txt"]
78
+ for keyword in sensitive_keywords:
79
+ response = response.replace(keyword, "")
80
+
81
+ # Remove sections starting with specific keywords
82
+ unwanted_sections = ["Page Label", "Page Label:", "page_label", "page_label:", "file_path:", "file_path"]
83
+ for section in unwanted_sections:
84
+ if section in response:
85
+ response = response.split(section)[0]
86
+
87
+ # Additional cleanup for any remaining artifacts from replacements
88
+ response = ' '.join(response.split())
89
+
90
+ # Update current chat history
91
+ current_chat_history.append((query, response))
92
+
93
+ return response
94
+
95
+ # Example usage: Process PDF ingestion from directory
96
+ print("Processing PDF ingestion from directory:", PDF_DIRECTORY)
97
+ data_ingestion_from_directory()
98
+
99
+ # Define the input and output components for the Gradio interface
100
+ input_component = gr.Textbox(
101
+ show_label=False,
102
+ placeholder="Ask me anything about JackNJill Solutions..."
103
+ )
104
+
105
+ output_component = gr.Textbox()
106
+
107
+ # Function to handle queries
108
+ def chatbot_handler(query):
109
+ response = handle_query(query)
110
+ return response
111
+
112
+ # Create the Gradio interface
113
+ interface = gr.Interface(
114
+ fn=chatbot_handler,
115
+ inputs=input_component,
116
+ outputs=output_component,
117
+ title="Welcome to JackNJill Solutions",
118
+ description="I am here to assist you with any questions you have about JackNJill Solutions. How can I help you today?"
119
+ )
120
+
121
+ # Launch the Gradio interface
122
+ interface.launch()