File size: 9,156 Bytes
12f0980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b646d8
12f0980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7e5a25
12f0980
 
 
 
 
 
 
 
 
 
 
e7e5a25
12f0980
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import os
import re
import traceback  # Detailed error logging for all handlers
from datetime import datetime

import gradio as gr
import pandas as pd  # Excel (.xlsx) processing
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq

# SECURITY: Never hard-code an API key in source — a committed key is a leaked
# key and must be revoked. Supply GROQ_API_KEY via the environment instead
# (on Hugging Face Spaces, add it as a Space Secret named GROQ_API_KEY;
# langchain_groq's ChatGroq reads it from the environment automatically).
if not os.environ.get("GROQ_API_KEY"):
    # Warn loudly at startup rather than failing later with an opaque auth error.
    print("⚠️ GROQ_API_KEY is not set. Add it as an environment variable or Space Secret.")

# Global handle to the RetrievalQA chain. Populated by handle_upload() once a
# timetable file has been processed; read by ask_question().
qa_chain = None

# Step 1: Process file (supports PDF and Excel with enhanced cleaning)
def process_file(file_path):
    """
    Extract all text content from an uploaded timetable file.

    Supports PDF (via PyPDF2, page by page) and Excel .xlsx (via pandas,
    every sheet). Excel rows are flattened to space-joined strings, one row
    per line, with NaN cells dropped.

    Args:
        file_path: Path to the uploaded file.
    Returns:
        A cleaned string containing all text extracted from the file.
    Raises:
        ValueError: If the extension is neither .pdf nor .xlsx.
        Exception: Any parsing error is logged with a traceback and re-raised
            so handle_upload() can surface it in the UI.
    """
    text = ""
    file_extension = os.path.splitext(file_path)[1].lower()

    try:
        if file_extension == ".pdf":
            pdf_reader = PdfReader(file_path)
            for page in pdf_reader.pages:
                # extract_text() can return None (or "") for image-only or
                # empty pages; guard so concatenation never raises TypeError.
                text += page.extract_text() or ""
        elif file_extension == ".xlsx":
            # Read every sheet in the workbook.
            xls = pd.ExcelFile(file_path)
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # Normalize cells: blank out NaN and force everything to str
                # so mixed-type columns join without type errors.
                df = df.fillna('').astype(str)

                # Flatten each row to "cell cell cell" — customize here if a
                # labeled format (e.g. "Course: Math Time: 10 AM") is wanted.
                for _, row in df.iterrows():
                    row_text = ' '.join(cell.strip() for cell in row if cell.strip())
                    if row_text:  # Skip rows that are empty after stripping
                        text += row_text + "\n"

                # Separate sheets so later chunking doesn't merge unrelated
                # tables (no separator after the final sheet).
                if sheet_name != xls.sheet_names[-1]:
                    text += "\n--- End of Sheet: " + sheet_name + " ---\n\n"
        else:
            raise ValueError("Unsupported file type. Please upload a PDF or XLSX file.")

        # Collapse runs of blank lines/spaces left over from extraction.
        # (re.sub handles arbitrary run lengths; chained str.replace only
        # shrinks each run by one per pass.)
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()
    except Exception as e:
        print(f"Error processing file: {e}")
        traceback.print_exc()
        raise  # Re-raise to be caught by the calling function

def create_qa_chain(raw_text):
    """
    Build a RetrievalQA chain (Groq LLM + FAISS retriever) over the text.

    Args:
        raw_text: The extracted text from the uploaded file.
    Returns:
        A configured RetrievalQA chain ready to answer questions.
    """
    try:
        # Chunk the document so each piece fits comfortably in the prompt.
        chunker = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = chunker.split_text(raw_text)

        # Embed chunks with a compact pre-trained sentence transformer.
        embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Index the chunks in FAISS and expose a top-3 similarity retriever.
        index = FAISS.from_texts(chunks, embedding=embedder)
        doc_retriever = index.as_retriever(search_kwargs={"k": 3})

        # Prompt for the LLM; {current_datetime} is filled in per query
        # by ask_question() via PromptTemplate.partial().
        prompt_template = """
        You are a helpful university assistant. Today is {current_datetime}.
        Answer the question based on the context from the university timetable.

        Context:
        {context}

        Question:
        {question}
        """
        qa_prompt = PromptTemplate(
            input_variables=["context", "question", "current_datetime"],
            template=prompt_template
        )

        # Deterministic (temperature=0) Groq-hosted Llama model.
        groq_llm = ChatGroq(temperature=0, model_name="llama-3.1-8b-instant")

        # Assemble and return the RetrievalQA chain.
        return RetrievalQA.from_chain_type(
            llm=groq_llm,
            retriever=doc_retriever,
            chain_type="stuff",
            chain_type_kwargs={"prompt": qa_prompt},
            input_key="question"  # which input variable carries the query
        )
    except Exception as e:
        print(f"Error creating QA chain: {e}")
        traceback.print_exc()
        raise  # Re-raise to be caught by the calling function

# Step 2: Upload handler
def handle_upload(file):
    """
    Process an uploaded timetable (PDF or Excel) and build the QA chain.

    Args:
        file: The uploaded file object from Gradio.
    Returns:
        A human-readable status message for the upload textbox.
    """
    global qa_chain
    try:
        extracted = process_file(file.name)
        qa_chain = create_qa_chain(extracted)
    except Exception as err:
        print(f"Error in handle_upload: {err}")
        traceback.print_exc()
        return f"❌ Failed to upload timetable: {err}"
    return "βœ… Timetable uploaded successfully. You can now ask questions!"

# Step 3: Chat handler
def ask_question(query, history):
    """
    Answer a user question against the uploaded timetable via the QA chain.

    Args:
        query: The user's current question string.
        history: Conversation history from gr.ChatInterface (not used by the
                 RAG chain, but required by the callback signature).
    Returns:
        The model's answer, or a status/error message string.
    """
    if not qa_chain:
        return "⚠️ Please upload a timetable PDF or Excel file first."

    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Inject the current datetime by temporarily swapping in a partial prompt.
    original_prompt = qa_chain.combine_documents_chain.llm_chain.prompt
    qa_chain.combine_documents_chain.llm_chain.prompt = original_prompt.partial(
        current_datetime=current_time
    )
    try:
        # input_key="question" on the RetrievalQA chain means this is the
        # only key we need to supply.
        response_dict = qa_chain({"question": query})
        return response_dict['result']
    except Exception as e:
        print(f"An error occurred during question answering: {e}")
        traceback.print_exc()  # Full stack to the Colab console / Space logs
        return "An error occurred while processing your question. Please check the Google Colab console or Hugging Face Space logs for details."
    finally:
        # BUGFIX: always restore the original prompt. The previous version
        # restored it only on success, so any exception left a stale datetime
        # permanently baked into every subsequent call.
        qa_chain.combine_documents_chain.llm_chain.prompt = original_prompt

# Gradio app interface definition
with gr.Blocks() as app:
    gr.Markdown("# 🏫 Univoid AI (Groq + Langchain + Gradio)")

    with gr.Row():
        # Accepts either a PDF or an Excel workbook.
        uploader = gr.File(label="πŸ“„ Upload Timetable PDF or Excel (XLSX)", file_types=[".pdf", ".xlsx"])
        # Read-only textbox that reports upload success/failure.
        status = gr.Textbox(label="πŸ“₯ Upload Status", interactive=False)

    # Rebuild the QA chain whenever a new file is supplied.
    uploader.change(fn=handle_upload, inputs=uploader, outputs=status)

    # Conversational UI backed by the RAG chain.
    gr.ChatInterface(fn=ask_question, title="🧠 Ask anything related Timetable")

# Launch the Gradio app
app.launch()