import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the pre-trained embedding model once and reuse it across reruns.
# Streamlit re-executes this script on every widget interaction, so an
# uncached constructor call would reload the model each time.
@st.cache_resource
def _load_model():
    """Return the shared SentenceTransformer instance used for embeddings."""
    return SentenceTransformer('all-MiniLM-L6-v2')


model = _load_model()

# Helper: turn text extracted from a PDF into [name, grade, marks] rows
def _parse_pdf_rows(text):
    """Split extracted PDF text into three-field rows.

    Each non-blank line is split from the right into at most three fields so
    that a multi-word name stays together in the first field. Lines that do
    not yield exactly three fields are skipped.

    Args:
        text: The text content of the PDF, one record per line.

    Returns:
        A list of ``[name, grade, marks]`` string lists.
    """
    rows = []
    for line in text.splitlines():
        # Strip first: rsplit with maxsplit keeps leading whitespace in the
        # leftover (first) field otherwise.
        fields = line.strip().rsplit(maxsplit=2)
        if len(fields) == 3:
            rows.append(fields)
    return rows


# Function to process uploaded files
def process_file(uploaded_file):
    """Read an uploaded Excel, CSV, or PDF file into a DataFrame.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute; the
            file type is chosen from the extension of that name.

    Returns:
        The parsed DataFrame with column labels converted to stripped
        strings, or ``None`` if the extension is unsupported or reading the
        file fails. In both failure cases an error is shown via ``st.error``.
    """
    try:
        # Match extensions case-insensitively so "MARKS.CSV" is accepted too.
        filename = uploaded_file.name.lower()

        if filename.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(uploaded_file)
        elif filename.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        elif filename.endswith('.pdf'):
            # Imported lazily so PyPDF2 is only needed for PDF uploads.
            from PyPDF2 import PdfReader
            reader = PdfReader(uploaded_file)
            # Join pages with a newline so the last line of one page is not
            # glued onto the first line of the next; `or ""` guards against a
            # page whose extraction yields nothing.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            # Assuming a format with Name, Grade, Marks per line
            # NOTE(review): main() requires Class/Maths/Urdu/English columns,
            # which this PDF schema does not produce -- confirm the intended
            # PDF layout.
            df = pd.DataFrame(_parse_pdf_rows(text), columns=["Name", "Grade", "Marks"])
            # Convert marks column to numeric
            df["Marks"] = pd.to_numeric(df["Marks"], errors='coerce')
        else:
            st.error("Unsupported file format. Please upload Excel, CSV, or PDF.")
            return None

        # Clean column names. Labels are passed through str() first because
        # the `.str` accessor raises on non-string headers (e.g. numeric ones).
        df.columns = [str(col).strip() for col in df.columns]
        st.write("DataFrame columns after cleaning:", df.columns)

        return df

    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any reader/parser
        # failure should surface as a message rather than a traceback.
        st.error(f"Error processing file: {e}")
        return None

# Helper: similarity of a search query against every row of the table
def _row_similarities(df, query, text_columns):
    """Return the cosine similarity between ``query`` and each row of ``df``.

    Each row is rendered as the space-separated values of ``text_columns``
    and embedded with the module-level ``model``.

    Args:
        df: DataFrame containing ``text_columns``; must have at least one row.
        query: The search string to compare against.
        text_columns: Column names whose values form each row's text.

    Returns:
        One similarity score per row, in row order.
    """
    row_texts = [
        " ".join(map(str, values))
        for values in df[text_columns].itertuples(index=False, name=None)
    ]
    # One batched encode call instead of one model call per row.
    # Assumes encode() returns NumPy arrays (the library default) -- the
    # arithmetic below relies on array operations.
    row_embeddings = model.encode(row_texts)
    query_embedding = model.encode(query)

    # Normalise by both vector lengths so the score is a true cosine.
    row_norms = (row_embeddings * row_embeddings).sum(axis=1) ** 0.5
    query_norm = query_embedding.dot(query_embedding) ** 0.5
    return (row_embeddings @ query_embedding) / (row_norms * query_norm)


# Main app
def main():
    """Render the app: upload a file, total the marks, rank, and search.

    Stops early (after showing any relevant error) when no file has been
    uploaded, the file cannot be parsed, or required columns are missing.
    """
    st.title("School Performance Analysis App")
    st.write("Upload a document containing student grades and marks to analyze their performance.")

    uploaded_file = st.file_uploader("Upload Excel, CSV, or PDF file", type=["xlsx", "xls", "csv", "pdf"])
    if uploaded_file is None:
        return

    df = process_file(uploaded_file)
    if df is None:
        return

    st.subheader("Uploaded Data")
    st.write(df.head())

    # Ensure the required columns exist (excluding "S.No.", "Science", "History", "Islamiat", "Geography")
    required_columns = ["Name", "Class", "Maths", "Urdu", "English"]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}. Please check your data.")
        return

    # Convert marks columns to numeric; unparseable values become NaN.
    marks_columns = ["Maths", "Urdu", "English"]
    for col in marks_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Total Marks is the row-wise sum of the subject columns (NaN marks are
    # skipped by sum(), i.e. they contribute nothing to the total).
    df['Total Marks'] = df[marks_columns].sum(axis=1)

    # Show the top 10 students based on Total Marks
    top_performers = df.sort_values(by="Total Marks", ascending=False).head(10)
    st.subheader("Top 10 Students Based on Total Marks")
    st.write(top_performers[["Name", "Class", "Total Marks"]])

    # Search functionality
    st.subheader("Search for a Student")
    search_query = st.text_input("Enter the student's name:")

    if search_query:
        if df.empty:
            # Nothing to rank; avoids indexing into an empty result below.
            st.warning("No student records to search.")
        else:
            # Embeddings are computed only when a search is requested and are
            # not stored in df, so the table shown below stays readable.
            df['Similarity'] = _row_similarities(df, search_query, required_columns)
            # Get the student with the highest similarity score
            result = df.sort_values(by="Similarity", ascending=False).iloc[0]
            st.write("Search Result:")
            st.write(result[["Name", "Class", "Maths", "Urdu", "English", "Total Marks"]])

    # Show the updated data with Total Marks column
    st.subheader("Updated Data with Total Marks")
    st.write(df)

# Invoke main() only when this file is executed as the entry-point script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()