File size: 8,343 Bytes
5f261e2
 
f5b944f
5f261e2
 
 
 
 
f5b944f
e0408ee
 
bc66734
5f261e2
 
 
bc66734
5f261e2
e0408ee
bc66734
5f261e2
 
 
 
 
bc66734
5f261e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0408ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f261e2
 
 
 
 
 
 
 
 
e0408ee
 
 
 
 
 
 
 
 
5f261e2
 
 
e0408ee
 
 
 
 
 
 
 
 
5f261e2
e0408ee
 
5f261e2
 
 
e0408ee
5f261e2
 
 
e0408ee
 
 
 
5f261e2
 
 
 
e0408ee
5f261e2
e0408ee
 
5f261e2
e0408ee
 
 
 
5f261e2
e0408ee
 
27271f0
e0408ee
 
 
5f261e2
 
e0408ee
5f261e2
e0408ee
5f261e2
 
 
e0408ee
5f261e2
 
 
 
 
 
 
 
 
 
 
 
 
e0408ee
 
 
 
 
 
5f261e2
 
e0408ee
5f261e2
 
 
 
 
 
e0408ee
5f261e2
 
e0408ee
 
 
 
 
 
 
 
 
 
 
 
 
 
5f261e2
 
e0408ee
0a0c464
e0408ee
 
 
 
 
 
 
 
 
 
0a0c464
e0408ee
c71a8cd
84e4ebc
0d9d690
5f261e2
e0408ee
 
 
 
 
84e4ebc
0d9d690
 
e0408ee
 
84e4ebc
0d9d690
e0408ee
 
 
 
84e4ebc
0d9d690
 
e0408ee
0d9d690
 
 
 
e0408ee
 
84e4ebc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import gradio as gr
import logging
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
from simple_salesforce import Salesforce
from dotenv import load_dotenv
import zipfile
from pathlib import Path

# Setup logging: INFO-level root configuration plus a module-scoped logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Populate the process environment from a local .env file, if one exists
load_dotenv()

# Salesforce credentials are read exclusively from environment variables
sf_username = os.getenv("SF_USERNAME")
sf_password = os.getenv("SF_PASSWORD")
sf_security_token = os.getenv("SF_SECURITY_TOKEN")
sf_instance_url = os.getenv("SF_INSTANCE_URL")

# Abort start-up unless every credential is present and non-empty
if not all((sf_username, sf_password, sf_security_token, sf_instance_url)):
    logger.error("❌ Salesforce credentials are missing from environment variables!")
    raise ValueError("Salesforce credentials are not properly set.")

# Salesforce connection: any failure is logged and then re-raised unchanged
try:
    sf = Salesforce(
        username=sf_username,
        password=sf_password,
        security_token=sf_security_token,
        instance_url=sf_instance_url,
    )
except Exception as e:
    logger.error(f"❌ Salesforce connection failed: {str(e)}")
    raise
else:
    logger.info("✅ Connected to Salesforce")

# --- Extract zip files and read documents ---
def extract_zip(zip_path, extract_to):
    """Unpack the archive at *zip_path* into the directory *extract_to*.

    Args:
        zip_path: Location of the zip archive to open.
        extract_to: Directory that receives the archive's contents.

    Raises:
        Exception: Whatever opening or extracting the archive raised; the
            failure is logged first and then propagated unchanged.
    """
    try:
        archive = zipfile.ZipFile(zip_path, 'r')
        with archive:
            archive.extractall(extract_to)
        logger.info(f"Extracted {zip_path} to {extract_to}")
    except Exception as exc:
        logger.error(f"Failed to extract {zip_path}: {str(exc)}")
        raise

def load_documents(folder_path):
    """Read every ``.txt`` file found under *folder_path*, recursively.

    Files are visited in sorted path order so that repeated runs over the
    same tree yield the documents in the same sequence. Entries that match
    ``*.txt`` but are not regular files (for example a directory named
    ``notes.txt``) are skipped instead of raising.

    Args:
        folder_path: Directory to search (``str`` or ``Path``).

    Returns:
        A ``(documents, sources)`` tuple of parallel lists: the decoded text
        of each file and that file's base name. Both lists are empty when no
        matching file exists.
    """
    documents = []
    sources = []
    for file in sorted(Path(folder_path).rglob("*.txt")):
        # rglob matches on name only, so a directory can slip through.
        if not file.is_file():
            continue
        # Undecodable bytes are dropped rather than aborting the whole load.
        documents.append(file.read_text(encoding="utf-8", errors="ignore"))
        sources.append(file.name)
    return documents, sources

# --- Chunking ---
# Splitter configured with chunk_size=300 and chunk_overlap=50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# --- Load model ---
# Sentence-embedding model shared by indexing and by query encoding
model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Preprocessing ---
# Every archive is unpacked into its own sub-folder of ./data
data_dir = Path("./data")
data_dir.mkdir(exist_ok=True)

# (zip archive name, folder name used for extraction and in source URLs)
doc_folders = [
    ("Company_Policies.zip", "Company_Policies"),
    ("HR_Policies.zip", "Hr_Policies"),
    ("Contract_Clauses.zip", "Contract_Clauses"),
]

# Parallel lists: all_chunks[i] is a text chunk, metadata[i] is its source URL
all_chunks = []
metadata = []

for zip_name, folder in doc_folders:
    zip_path = Path(zip_name)
    if not zip_path.exists():
        missing_msg = f"Zip file {zip_name} not found"
        logger.error(missing_msg)
        raise FileNotFoundError(missing_msg)

    extract_path = data_dir / folder
    extract_path.mkdir(exist_ok=True)
    extract_zip(zip_path, extract_path)

    docs, sources = load_documents(extract_path)
    if not docs:
        empty_msg = f"No documents found in {extract_path}"
        logger.error(empty_msg)
        raise ValueError(empty_msg)

    for text, file_name in zip(docs, sources):
        pieces = text_splitter.split_text(text)
        all_chunks += pieces
        # One metadata entry per chunk keeps the two lists index-aligned
        metadata += [f"https://company.com/{folder}/{file_name}"] * len(pieces)

# --- Embeddings + FAISS index ---
# Without any chunks there is nothing to embed and no embedding width to size
# the index with, so stop here with an explicit error instead of an opaque
# IndexError from `embeddings.shape[1]`.
if not all_chunks:
    logger.error("No text chunks were produced from the documents")
    raise ValueError("No text chunks were produced from the documents")

# One embedding row per chunk; row i corresponds to all_chunks[i] / metadata[i].
# The float32 conversion is explicit because FAISS indexes operate on float32.
embeddings = np.asarray(model.encode(all_chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
logger.info("FAISS index built successfully")

# --- Create Record in Salesforce ---
def create_salesforce_record(query, answer, confidence_percentage, source_link):
    """Log one question/answer exchange as a ``chat_query_log__c`` record.

    Args:
        query: The user's question.
        answer: The answer text returned to the user.
        confidence_percentage: Confidence score; coerced to a built-in
            ``float`` so numpy scalars can be sent.
        source_link: Link to the document the answer came from.

    Returns:
        The new record's ID, or ``None`` when the response carries no ``id``
        or when any exception occurs (the failure is logged, not raised).
    """
    try:
        # Field names follow the custom object's API names
        payload = {
            "Query__c": query,
            "Answer__c": answer,
            "Confidence_Percentage__c": float(confidence_percentage),
            "Document_link__c": source_link,
        }
        result = sf.chat_query_log__c.create(payload)

        # A response without an 'id' means the record was not created
        if 'id' not in result:
            logger.error(f"❌ Failed to create Salesforce record. Response: {result}")
            return None

        new_id = result['id']
        logger.info(f"✅ Record created successfully in Salesforce with ID: {new_id}")
        return new_id
    except Exception as exc:
        # Best effort: a logging failure must not break answering the user
        logger.error(f"Error creating Salesforce record: {str(exc)}")
        return None

# --- Search & Answer ---
def answer_query(query, k=3, max_distance=0.8):
    """Answer *query* with the closest indexed chunk and log it to Salesforce.

    The query is embedded with the shared model and the FAISS index is
    searched for the ``k`` nearest chunks. Chunks whose distance is not below
    ``max_distance`` are discarded. The closest remaining chunk becomes the
    answer and the exchange is recorded via ``create_salesforce_record``.

    Args:
        query: The user's question.
        k: Number of neighbours requested from the index.
        max_distance: Exclusive upper bound on the distance of a usable chunk.

    Returns:
        Always a 4-tuple of strings
        ``(answer, confidence_text, source_text, record_text)``:

        * on a hit, the chunk text, ``"Confidence: NN.NN%"``,
          ``"Source Link: <url>"`` and either the Salesforce record ID line
          or a record-creation failure notice;
        * when nothing is close enough, a "no relevant information" message
          with 0% confidence, no source and an empty record text;
        * on any exception, ``("Error: <message>", "", "", "")``.
    """
    try:
        logger.info(f"Processing query: {query}")
        query_embedding = model.encode([query])
        D, I = index.search(np.array(query_embedding), k=k)

        # FAISS pads the result with label -1 when fewer than k vectors exist.
        # Those must be dropped, or -1 would silently index the last chunk.
        hits = [
            (all_chunks[idx], metadata[idx], dist)
            for idx, dist in zip(I[0], D[0])
            if idx >= 0 and dist < max_distance
        ]

        if not hits:
            # Same arity as every other return path so callers can always
            # unpack four values.
            return (
                "No relevant information found.",
                "Confidence: 0%",
                "Source Link: None",
                "",
            )

        # Search results arrive nearest-first, so the first surviving hit is
        # also the one with the smallest distance.
        best_chunk, source_link, best_distance = hits[0]
        answer = best_chunk.strip()
        confidence_percentage = max(0.0, 100.0 - float(best_distance) * 100.0)

        # Create Salesforce record for the query response
        record_id = create_salesforce_record(query, answer, confidence_percentage, source_link)
        if record_id:
            record_text = f"Salesforce Record ID: {record_id}"
        else:
            record_text = "Failed to create record in Salesforce"

        return (
            answer,
            f"Confidence: {confidence_percentage:.2f}%",
            f"Source Link: {source_link}",
            record_text,
        )
    except Exception as e:
        # UI boundary: report the error in the answer slot instead of crashing
        logger.error(f"Error in answer_query: {str(e)}")
        return f"Error: {str(e)}", "", "", ""

# --- Gradio Chatbot UI Design ---
def process_question(q, chat_history):
    """Handle a submitted question and produce values for the four UI outputs.

    Args:
        q: Text from the question box.
        chat_history: Current chatbot value, a list of
            ``(user_message, bot_message)`` pairs, or ``None``.

    Returns:
        A 4-tuple ``(chat_history, confidence, source, record_id)`` matching
        the four outputs wired to the submit button. For a blank question the
        history gains a bot-only prompt and the three text outputs are empty.
    """
    # Work on a copy so the caller's list is never mutated; tolerate None.
    history = list(chat_history or [])

    if not q or not q.strip():
        # None on the user side renders a bot-only message.
        history.append((None, "Please enter a question."))
        return history, "", "", ""

    # Pad defensively so a shorter result tuple still fills all four outputs.
    result = tuple(answer_query(q)) + ("", "", "", "")
    answer, confidence, source, record_id = result[:4]

    # Each chat entry is one (user message, bot message) pair.
    history.append((q, answer))
    return history, confidence, source, record_id

# --- Chatbot UI with dynamic styling using elem_id ---
# Layout: a title, a row holding the question box and the submit button, then
# a row holding the chat display and three Markdown areas (confidence, source
# link, Salesforce record). Components are placed in the order they are
# created inside the nested `with` blocks, so statement order is significant.
with gr.Blocks(title="Company Documents Q&A Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📚 **Company Policies Q&A Chatbot**")

    # Input row: the question box is given three times the button's scale
    with gr.Row():
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a Question",
                placeholder="What are the conditions for permanent employment status?",
                lines=1,
                interactive=True,
                elem_id="user-question",
                visible=True
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("Submit", variant="primary", elem_id="submit-btn")

    # Output row: conversation plus the per-answer details
    with gr.Row():
        with gr.Column():
            chat_history = gr.Chatbot(
                label="Chat History", 
                elem_id="chatbox",
                height=400,  # Set a fixed height
                show_label=False  # Hide the label to make the chat more clean
            )
            conf_out = gr.Markdown(label="Confidence", elem_id="confidence")
            source_out = gr.Markdown(label="Source Link", elem_id="source-link")
            record_out = gr.Markdown(label="Salesforce Record ID", elem_id="salesforce-id")

    # process_question receives (question text, current chat value) and must
    # return one value for each of the four listed outputs, in this order.
    submit_btn.click(fn=process_question, inputs=[question, chat_history], outputs=[chat_history, conf_out, source_out, record_out])

# Serve on all network interfaces at port 7860.
# NOTE(review): share=True presumably also requests a public Gradio share link — confirm this is intended.
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)