# app.py
#
# Gradio RAG chatbot for the NRL procurement department.
# PDFs are chunked, embedded, and stored in a FAISS index that lives in a
# Hugging Face *dataset* repo; questions are answered by one of two LLMs
# (TinyLlama-1.1B-Chat or google/flan-t5-small) through a RetrievalQA chain.

import os
import shutil

import gradio as gr
import torch
from huggingface_hub import HfApi, file_exists, hf_hub_download
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# NOTE: the original file imported HuggingFacePipeline from both
# langchain.llms and langchain_huggingface; the latter shadowed the former,
# so only the langchain_huggingface import is kept.
from langchain_huggingface import HuggingFacePipeline

# BUG FIX: os.environ values must be str — assigning os.getenv(...) directly
# raised TypeError whenever HF_TOKEN was unset. Guard the assignment instead.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = _hf_token
api = HfApi(token=_hf_token)

# Embedding model used for query-time retrieval.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Module-level state shared across Gradio UI events.
qa_chain = None    # RetrievalQA chain backed by the TinyLlama LLM
qa_chain1 = None   # RetrievalQA chain backed by the flan-t5 LLM
llm = None
llm1 = None
repo_id = os.getenv("reposit_id")  # HF dataset repo holding index + PDFs

# ============================================= google/flan-t5-small
# Lightweight seq2seq model, suitable for CPU.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe1 = pipeline(
    "text2text-generation", model=model, tokenizer=tokenizer, max_length=512
)
if llm1 is None:
    llm1 = HuggingFacePipeline(pipeline=pipe1)

# ============================================= TinyLlama/TinyLlama-1.1B-Chat-v1.0
# BUG FIX: the original passed pad_token_id=tokenizer.eos_token_id, i.e. the
# *flan-t5* tokenizer's EOS id — wrong vocabulary. Use TinyLlama's own tokenizer.
tinyllama_tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
)
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    tokenizer=tinyllama_tokenizer,
    device_map="auto" if torch.cuda.is_available() else None,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.15,
    pad_token_id=tinyllama_tokenizer.eos_token_id,
    trust_remote_code=True,
)
if llm is None:
    llm = HuggingFacePipeline(pipeline=pipe)

# =============================================
def format_as_bullets(text):
    """Convert answer text to bullet points, one bullet per non-empty line."""
    lines = text.strip().split('\n')
    bullet_lines = [f"• {line.strip()}" for line in lines if line.strip()]
    return '\n'.join(bullet_lines) if bullet_lines else text

# =============================================
def _split_and_save_index(documents, embeddings, out_dir="temp_faiss"):
    """Chunk loaded pages and save a fresh FAISS index under *out_dir*."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(out_dir)


def _upload_index_and_pdf(repo_id, file, index_dir="temp_faiss"):
    """Upload the source PDF plus index.faiss/index.pkl to the dataset repo."""
    hub = HfApi(token=os.getenv("HF_TOKEN"))
    hub.upload_file(
        path_or_fileobj=file,
        path_in_repo=f"docs/{os.path.basename(file)}",
        repo_id=repo_id,
        repo_type="dataset",
    )
    hub.upload_file(
        path_or_fileobj=f"{index_dir}/index.faiss",
        path_in_repo="index.faiss",
        repo_id=repo_id,
        repo_type="dataset",
    )
    hub.upload_file(
        path_or_fileobj=f"{index_dir}/index.pkl",
        path_in_repo="index.pkl",
        repo_id=repo_id,
        repo_type="dataset",
    )


def create_faiss_index(repo_id, file,
                       embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Create a FAISS index from a PDF and upload it to an HF dataset repo.

    Tries PyPDFLoader first and falls back to PyMuPDFLoader. Never raises;
    returns a human-readable status message instead.
    """
    message = "Index creation started"
    try:
        # Build a real embeddings object (load_local needs the object, not a name).
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Start from a clean temp directory.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")

        try:
            documents = PyPDFLoader(file).load()
            _split_and_save_index(documents, embeddings)
            _upload_index_and_pdf(repo_id, file)
            message = "✅ Index created successfully with PyPDFLoader and uploaded to repo"
        except Exception as e1:
            print(f"PyPDFLoader failed: {e1}")
            try:
                # Fallback loader for PDFs PyPDFLoader cannot parse.
                documents = PyMuPDFLoader(file).load()
                _split_and_save_index(documents, embeddings)
                _upload_index_and_pdf(repo_id, file)
                message = f"✅ PyPDFLoader failed ({e1}), PyMuPDFLoader succeeded and uploaded to repo"
            except Exception as e2:
                message = f"❌ Both loaders failed. PyPDF: {e1}, PyMuPDF: {e2}"
    finally:
        # Always remove the temp index directory.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return message

# =============
def update_faiss_from_hf(repo_id, file,
                         embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the existing FAISS index from HF, add a new PDF, push it back.

    Returns a multi-line status message; never raises.
    """
    message = ""
    temp_dir = "temp_faiss_update"
    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Download the existing index files from the dataset repo.
        print("Downloading existing FAISS index...")
        faiss_path = hf_hub_download(
            repo_id=repo_id, filename="index.faiss", repo_type="dataset"
        )
        hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")

        # Load the existing vectorstore from the download cache directory.
        # NOTE(review): pickle deserialization — only safe for a repo you control.
        vectorstore = FAISS.load_local(
            folder_path=os.path.dirname(faiss_path),
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        message += f"✅ Loaded existing index with {vectorstore.index.ntotal} vectors\n"

        # Load the new document, trying each loader in turn.
        documents = None
        for loader_name, loader_cls in (
            ("PyPDFLoader", PyPDFLoader),
            ("PyMuPDFLoader", PyMuPDFLoader),
        ):
            try:
                print(f"Trying {loader_name}...")
                documents = loader_cls(file).load()
                message += f"✅ Loaded {len(documents)} pages with {loader_name}\n"
                break
            except Exception as e:
                message += f"❌ {loader_name} failed: {str(e)[:100]}...\n"
                continue
        if documents is None:
            return "❌ All PDF loaders failed"

        # Split and merge into the existing index.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = splitter.split_documents(documents)
        message += f"✅ Created {len(new_docs)} chunks from new document\n"
        vectorstore.add_documents(new_docs)
        message += f"✅ Added to index. New total: {vectorstore.index.ntotal} vectors\n"

        # Save the merged index and upload everything.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        vectorstore.save_local(temp_dir)
        _upload_index_and_pdf(repo_id, file, index_dir=temp_dir)
        message += f"✅ Successfully updated repo with {len(new_docs)} new chunks!"
    except Exception as e:
        message += f"❌ Update failed: {str(e)}"
    finally:
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
    return message

# ====================
def upload_and_prepare(file, user):
    """Gradio handler: authorize, create/update the index, list repo PDFs.

    Returns (status_message, markdown_pdf_links).
    """
    mm = ""
    pdf_links = "**No PDFs**"
    if user != os.getenv("uploading_password"):
        return "❌ Unauthorized User", pdf_links
    try:
        # Update in place when an index already exists, otherwise create one.
        if file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset"):
            mm = update_faiss_from_hf(repo_id, file)
        else:
            mm = create_faiss_index(repo_id, file)

        # Refresh the markdown list of PDFs stored in the repo.
        hub = HfApi(token=os.getenv("HF_TOKEN"))
        pdf_files = hub.list_repo_files(repo_id, repo_type="dataset")
        pdf_links = "\n".join(
            f"• [📄 {f}](https://huggingface.co/datasets/{repo_id}/resolve/main/{f})"
            for f in pdf_files
            if f.endswith('.pdf')
        )
    except Exception as e:
        mm += f"\n❌ Error: {e}"
    return mm, pdf_links

# ============
def upload_and_prepare_old(file, user):
    """Deprecated handler kept for reference; superseded by upload_and_prepare."""
    mm = ""
    if user == os.getenv("uploading_password"):
        if file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset"):
            mm = update_faiss_from_hf(repo_id, file)
        if not file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset"):
            mm = create_faiss_index(repo_id, file)
    else:
        mm = "❌ Unauthorized User"

    hub = HfApi(token=os.getenv("HF_TOKEN"))
    pdf_files = hub.list_repo_files(repo_id, repo_type="dataset")
    pdf_links = "\n".join(
        f"• [📄 {f}](https://huggingface.co/datasets/{repo_id}/resolve/main/{f})"
        for f in pdf_files
        if f.endswith('.pdf')
    )
    return mm, pdf_links

# ======================================================================
def generate_qa_chain(repo_id,
                      embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                      llm=None):
    """Build a RetrievalQA chain from the FAISS index stored in *repo_id*.

    Returns the chain, or None on any failure.
    """
    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Download both index files; load_local reads them from one directory.
        faiss_path = hf_hub_download(
            repo_id=repo_id, filename="index.faiss", repo_type="dataset"
        )
        hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")

        vectorstore = FAISS.load_local(
            folder_path=os.path.dirname(faiss_path),
            embeddings=embeddings,  # must be the object, not the model-name string
            allow_dangerous_deserialization=True,
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
Answer strictly based on the context below.
Mention rule number / circular reference and **PAGE NUMBER**..
Add interpretation.
If answer is not found, say "Not available in the provided context".
Question: {question}
Context: {context}
Answer (include page references):
""",
        )

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            chain_type_kwargs={"prompt": prompt_template},
            retriever=retriever,
            return_source_documents=True,
        )
    except Exception as e:
        print(f"Error in generate_qa_chain: {e}")
        return None
    return qa_chain

# ============================
def bePrepare():
    """Build the TinyLlama-backed QA chain and store it globally."""
    global qa_chain
    qa_chain = generate_qa_chain(repo_id, llm=llm)
    return "I am ready, ask me questions with model tiny Lama."


def bePrepare1():
    """Build the flan-t5-backed QA chain and store it globally."""
    global qa_chain1
    qa_chain1 = generate_qa_chain(repo_id, llm=llm1)
    return "I am ready, ask me questions with model google flan-t5."


def _format_response(response):
    """Format a RetrievalQA response: answer, bullet form, and source links."""
    result = response["result"]
    bullet_result = format_as_bullets(result)
    source_info = ""
    for i, doc in enumerate(response.get("source_documents", [])[:3]):
        page_num = doc.metadata.get('page', 'Unknown')
        # BUG FIX: filename was computed but the link used a literal placeholder.
        filename = os.path.basename(doc.metadata.get('source', 'Unknown'))
        repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
        source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
    return f"{result}\n\n In bullet form \n{bullet_result}\n\n**📄 Sources:**{source_info}"


def ask_question(query):
    """Answer *query* with the TinyLlama chain (must be prepared first)."""
    if not query or not qa_chain:
        return "❌ Please click prepare button first and check whether question is empty"
    return _format_response(qa_chain.invoke({"query": query}))


def ask_question1(query):
    """Answer *query* with the flan-t5 chain (must be prepared first)."""
    if not query or not qa_chain1:
        return "❌ Please click prepare button first and check whether question is empty"
    return _format_response(qa_chain1.invoke({"query": query}))

# ===============================================
def delete_entire_repo(user):
    """Delete and recreate the dataset repo (authorized users only)."""
    repo = os.getenv("reposit_id")
    if user != os.getenv("uploading_password"):
        return "❌ Unauthorized user"
    try:
        hub = HfApi(token=os.getenv("HF_TOKEN"))
        hub.delete_repo(repo_id=repo, repo_type="dataset")
        hub.create_repo(repo_id=repo, repo_type="dataset", private=False)
        return f"✅ Repo {repo_id} reset successfully"
    except Exception as e:
        # BUG FIX: the original assigned this message but returned None.
        return f"❌ error during deletetion & creation of repo: {e} "

# ===============================================
def get_pdf_list():
    """Return a markdown list of PDFs currently stored in the dataset repo."""
    repo_id = os.getenv("reposit_id")
    try:
        hub = HfApi(token=os.getenv("HF_TOKEN"))
        files = hub.list_repo_files(repo_id, repo_type="dataset")
        pdf_files = [f for f in files if f.endswith('.pdf')]
        if not pdf_files:
            return "**No PDF documents in repo yet.**"
        links = []
        for pdf in pdf_files:
            url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{pdf}"
            links.append(f"• [📄 {os.path.basename(pdf)}]({url})")
        return f"**📚 Uploaded PDFs ({len(pdf_files)}):**\n" + "\n".join(links)
    except Exception as ee:
        print(ee)
        return f"**❌ Cannot load PDF list**error: {ee}"

# ===============================================
# Gradio UI
with gr.Blocks(title="N R L C H A T B O T - for commercial procurement - Supply", css="""
#blue-col { background: linear-gradient(135deg, #667eea, #764ba2); padding: 20px; border-radius: 10px; }
#green-col { background: #4ecdc4; padding: 20px; border-radius: 10px; }
""") as demo:
    gr.Markdown("## 🧠 For use of NRL procurement department Only")
    with gr.Row():
        # LEFT COLUMN: TinyLlama model
        with gr.Column(elem_id="blue-col", scale=1):
            gr.Markdown("## 🧠 Using heavy TinyLama Model")
            with gr.Row():
                Index_processing_output = gr.Textbox(
                    label="📁 Status for tiny lama", interactive=False
                )
            with gr.Row():
                Index_processing_btn = gr.Button(
                    "🔄 Clik to get the udated resources with tiny Lama"
                )
                Index_processing_btn.click(
                    bePrepare, inputs=None, outputs=Index_processing_output
                )
            with gr.Row():
                query_input = gr.Textbox(label="❓ Your Question pls")
            with gr.Row():
                query_btn = gr.Button("🧠 Get Answer")
            with gr.Row():
                answer_output = gr.Textbox(
                    label="✅ Answer with Document Links", lines=8
                )
            query_btn.click(ask_question, inputs=query_input, outputs=answer_output)

        # RIGHT COLUMN: google/flan-t5 model
        with gr.Column(elem_id="green-col", scale=2):
            gr.Markdown("## 🧠 Using ligth model - google flan-t5")
            Index_processing_output1 = gr.Textbox(
                label="📁 Status for google flan-t5", interactive=False
            )
            Index_processing_btn1 = gr.Button(
                "🔄 Clik to get the udated resources with google flan-t5"
            )
            Index_processing_btn1.click(
                bePrepare1, inputs=None, outputs=Index_processing_output1
            )
            query_input1 = gr.Textbox(label="❓ Your Question pls")
            query_btn1 = gr.Button("🧠 Get Answer")
            answer_output1 = gr.Textbox(
                label="✅ Answer with Document Links", lines=8
            )
            summary_output = gr.Markdown("**Summary will appear here**")
            query_btn1.click(ask_question1, inputs=query_input1, outputs=answer_output1)

    with gr.Row():
        # LEFT COLUMN: document management
        with gr.Column(elem_id="green-col", scale=1):
            gr.Markdown("## 📚 Uploaded Documents")
            with gr.Row():
                pdf_list = gr.Markdown("**No documents loaded yet.**")
                refresh_btn = gr.Button("🔄 Refresh")
                refresh_btn.click(get_pdf_list, inputs=None, outputs=pdf_list)

        with gr.Column(elem_id="blue-col", scale=1):
            gr.Markdown("## 🧠 For uploading new PDF documents.")
            with gr.Row():
                output_msg = gr.Textbox(
                    label="📁 Authorization Message", interactive=False
                )
            with gr.Row():
                file_input = gr.File(
                    label="📄 Upload .pdf File by only authorized user",
                    type="filepath",
                )
            with gr.Row():
                authorized_user = gr.Textbox(
                    label="Write the password to upload new Circular Doc."
                )
            with gr.Row():
                upload_btn = gr.Button("🔄 Process Doc")
                upload_btn.click(
                    upload_and_prepare,
                    inputs=[file_input, authorized_user],
                    outputs=[output_msg, pdf_list],
                )
            with gr.Row():
                delete_btn = gr.Button("🔄 Delete complete repo")
                delete_btn.click(
                    delete_entire_repo, inputs=authorized_user, outputs=output_msg
                )

# For HF Spaces
if __name__ == "__main__":
    demo.launch()