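# Assumed dependencies (inferred from the imports below; a sketch, not from the
# source) — a requirements.txt for this Space would need roughly:
#   gradio, PyPDF2, pandas, openpyxl (pandas' .xlsx engine),
#   langchain, langchain-community, langchain-groq,
#   faiss-cpu, sentence-transformers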
import os
from PyPDF2 import PdfReader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from datetime import datetime
import gradio as gr
import traceback  # For detailed error logging
import pandas as pd  # For Excel processing

# IMPORTANT: Replace "your_groq_api_key_here" with your actual Groq API key.
# For Hugging Face Spaces, it's highly recommended to set this as a Space Secret
# named GROQ_API_KEY. If set as a secret, you can simply use:
# os.environ.get("GROQ_API_KEY")
os.environ["GROQ_API_KEY"] = "your_groq_api_key_here"  # Never commit a real key
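# Safer alternative (a sketch of the Space Secret approach recommended above):
#   groq_api_key = os.environ.get("GROQ_API_KEY")
#   if not groq_api_key:
#       raise RuntimeError("GROQ_API_KEY secret is not set")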

# Global variable for QA chain
qa_chain = None

# Step 1: Process file (now supports PDF and Excel with enhanced cleaning)
def process_file(file_path):
    """
    Processes a file (PDF or Excel) to extract all text content,
    with enhanced cleaning for Excel data.

    Args:
        file_path: The path to the uploaded file.

    Returns:
        A string containing all text extracted from the file.
    """
    text = ""
    file_extension = os.path.splitext(file_path)[1].lower()
    try:
        if file_extension == ".pdf":
            pdf_reader = PdfReader(file_path)
            for page in pdf_reader.pages:
                # extract_text() can return None for pages without a text layer
                text += page.extract_text() or ""
        elif file_extension == ".xlsx":
            # Read all sheets from the Excel file
            xls = pd.ExcelFile(file_path)
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # --- START: Enhanced Excel Data Cleaning ---
                # Fill NaN values with an empty string for cleaner output
                df = df.fillna('')
                # Convert all columns to strings to avoid type errors during concatenation
                df = df.astype(str)
                # Convert the DataFrame to a simple string representation by joining
                # all cells in a row, separated by spaces. Customize this for your
                # Excel structure; e.g. with columns 'Course', 'Time', 'Room' you
                # might format rows as "Course: Math Time: 10 AM Room: A101".
                for index, row in df.iterrows():
                    # Join non-empty string values from the row, separated by a space
                    row_text = ' '.join(str(cell).strip() for cell in row if str(cell).strip())
                    if row_text:  # Only add if the row is not empty after stripping
                        text += row_text + "\n"
                # Add a separator between sheets (skip after the last sheet)
                if sheet_name != xls.sheet_names[-1]:
                    text += "\n--- End of Sheet: " + sheet_name + " ---\n\n"
                # --- END: Enhanced Excel Data Cleaning ---
        else:
            raise ValueError("Unsupported file type. Please upload a PDF or XLSX file.")

        # Basic text cleanup after extraction (for both PDF and Excel)
        text = text.replace('\n\n', '\n')  # Reduce doubled newlines
        text = text.replace('  ', ' ')     # Reduce doubled spaces
        text = text.strip()                # Remove leading/trailing whitespace
        return text
    except Exception as e:
        print(f"Error processing file: {e}")
        traceback.print_exc()
        raise  # Re-raise to be caught by the calling function
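
# Illustrative usage (hypothetical file name; not part of the app flow):
#   raw_text = process_file("timetable.xlsx")
#   # An Excel row ["Math", "10:00", "A101"] contributes the line "Math 10:00 A101".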

def create_qa_chain(raw_text):
    """
    Creates a RetrievalQA chain using Langchain, Groq, and FAISS.

    Args:
        raw_text: The extracted text from the file.

    Returns:
        A RetrievalQA chain ready to answer questions.
    """
    try:
        # Split text into chunks for vectorization
        splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        texts = splitter.split_text(raw_text)

        # Create embeddings using a pre-trained HuggingFace model
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Create a FAISS vector store from the text chunks and embeddings
        vectorstore = FAISS.from_texts(texts, embedding=embeddings)

        # Create a retriever to fetch relevant documents
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
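        # Design note: k=3 keeps the "stuff" chain's prompt small; consider a
        # larger k if answers miss entries that span multiple chunks.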
        # Define the prompt template for the LLM
        prompt_template = """
You are a helpful university assistant. Today is {current_datetime}.
Answer the question based on the context from the university timetable.

Context:
{context}

Question:
{question}
"""
        QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["context", "question", "current_datetime"],
            template=prompt_template
        )
        # Initialize the Groq language model
        llm = ChatGroq(temperature=0, model_name="llama-3.1-8b-instant")

        # Create the RetrievalQA chain
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            chain_type="stuff",
            chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
            input_key="question"  # Tells the chain which input variable is the main query
        )
        return qa
    except Exception as e:
        print(f"Error creating QA chain: {e}")
        traceback.print_exc()
        raise  # Re-raise to be caught by the calling function
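
# Standalone usage sketch (illustrative; assumes GROQ_API_KEY is set). The prompt
# declares "current_datetime" as an input variable, so it must be partial-filled
# before the chain is called — ask_question() below does this per query:
#   chain = create_qa_chain("Math 10:00 Room A101\nPhysics 11:00 Room B202")
#   chain.combine_documents_chain.llm_chain.prompt = (
#       chain.combine_documents_chain.llm_chain.prompt.partial(
#           current_datetime="2024-01-01 09:00:00"))
#   print(chain({"question": "When is Math?"})["result"])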

# Step 2: Upload handler
def handle_upload(file):
    """
    Handles the file upload (PDF or Excel), processes it, and initializes the QA chain.

    Args:
        file: The uploaded file object from Gradio.

    Returns:
        A status message string.
    """
    global qa_chain
    try:
        raw_text = process_file(file.name)  # Use the updated process_file
        qa_chain = create_qa_chain(raw_text)
        return "✅ Timetable uploaded successfully. You can now ask questions!"
    except Exception as e:
        print(f"Error in handle_upload: {e}")
        traceback.print_exc()
        return f"❌ Failed to upload timetable: {e}"

# Step 3: Chat handler
def ask_question(query, history):
    """
    Handles user questions, queries the QA chain, and returns the answer.

    Args:
        query: The user's current question string.
        history: The conversation history (list of lists: [[user_msg, bot_msg], ...]).
                 Required by gr.ChatInterface, even if not directly used in RAG.

    Returns:
        The answer from the RAG model or an error message.
    """
    if not qa_chain:
        return "⚠️ Please upload a timetable PDF or Excel file first."

    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        # Create a partially filled prompt with the current datetime for this specific query
        original_prompt = qa_chain.combine_documents_chain.llm_chain.prompt
        current_datetime_aware_prompt = original_prompt.partial(current_datetime=current_time)
        try:
            # Temporarily swap in the datetime-aware prompt
            qa_chain.combine_documents_chain.llm_chain.prompt = current_datetime_aware_prompt
            # Call the chain with only the 'question' key, as input_key="question" is set on RetrievalQA
            response_dict = qa_chain({"question": query})
            response = response_dict['result']
        finally:
            # IMPORTANT: Restore the original prompt even if the call fails,
            # to avoid side effects for subsequent calls
            qa_chain.combine_documents_chain.llm_chain.prompt = original_prompt
        return response
    except Exception as e:
        # Catch any errors during the QA chain run and print them
        print(f"An error occurred during question answering: {e}")
        traceback.print_exc()  # Prints the full stack trace to the console or Space logs
        return "An error occurred while processing your question. Please check the Google Colab console or Hugging Face Space logs for details."

# Gradio app interface definition
with gr.Blocks() as app:
    gr.Markdown("# 🏫 Univoid AI (Groq + Langchain + Gradio)")
    with gr.Row():
        # File upload component - accepts PDF and XLSX
        file_input = gr.File(label="📄 Upload Timetable PDF or Excel (XLSX)", file_types=[".pdf", ".xlsx"])
        # Textbox to show upload status
        upload_output = gr.Textbox(label="📥 Upload Status", interactive=False)
    # Event listener for file upload
    file_input.change(fn=handle_upload, inputs=file_input, outputs=upload_output)
    # Chatbot interface
    chatbot = gr.ChatInterface(fn=ask_question, title="🧠 Ask anything related to the timetable")

# Launch the Gradio app
app.launch()
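
# Note (optional, not part of the original app): outside Hugging Face Spaces
# (e.g. in Colab), a public link can be requested with:
#   app.launch(share=True)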