Spaces:

meesamraza
/

document_gpt

Sleeping

File size: 4,730 Bytes

c608c63
11318ba
8796412
11318ba
 
235deb2
11318ba
235deb2
a12e1b9
11318ba
 
a12e1b9
c608c63
e5f5057
c608c63
e5f5057
c608c63
 
e5f5057
 
 
11318ba
 
 
 
 
e5f5057
 
 
11318ba
 
f4cfcfd
11318ba
235deb2
f4cfcfd
235deb2
 
 
 
f4cfcfd
11318ba
e5f5057
 
 
11318ba
 
 
 
 
 
 
c563266
11318ba
e5f5057
 
 
11318ba
235deb2
e5f5057
11318ba
e5f5057
 
 
11318ba
8796412
 
 
 
 
 
 
11318ba
e5f5057
11318ba
e5f5057
11318ba
 
f4cfcfd
e5f5057
 
f4cfcfd
235deb2
8796412
 
f4cfcfd
11318ba
e5f5057
11318ba
e5f5057
 
 
11318ba
8e14024
0d23722
a09f616
11318ba
e5f5057
11318ba
 
f4cfcfd
 
11318ba
e5f5057
11318ba
e5f5057
 
 
 
 
f4cfcfd
 
e5f5057
 
 
 
f4cfcfd
e5f5057
 
 
 
 
 
 
 
 
 
 
 
 
 
11318ba
e5f5057

import os
import logging
import time
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq

# --------------------------
# Load environment variables
# --------------------------
load_dotenv()

# --------------------------
# Logging configuration
# --------------------------
# Root logger at INFO; every record is rendered as "<time> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# --------------------------
# PDF text extraction
# --------------------------
def get_pdf_text(pdf_docs):
    """Extract the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources; each item is handed to
            ``PdfReader`` as-is.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the concatenation of the
        extracted text of each page that yielded any, every piece followed by
        a newline. ``page_count`` is the total number of pages across all
        documents, including pages that yielded no text.
    """
    fragments = []
    total_pages = 0
    for document in pdf_docs:
        pages = PdfReader(document).pages
        total_pages += len(pages)
        for page in pages:
            content = page.extract_text()
            # Pages without extractable text give a falsy result; skip them.
            if not content:
                continue
            fragments.append(content + "\n")
    return "".join(fragments), total_pages

# --------------------------
# Text chunking
# --------------------------
def get_text_chunks(text):
    """Split *text* into chunks with ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, ``chunk_size=1000``,
    ``chunk_overlap=200`` and ``len`` as the length function.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for *text*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    chunks = splitter.split_text(text)
    return chunks

# --------------------------
# FAISS VectorStore creation
# --------------------------
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from the given text chunks.

    Args:
        text_chunks: The texts to index.

    Returns:
        The store produced by ``FAISS.from_texts`` using
        ``HuggingFaceEmbeddings`` with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    store = FAISS.from_texts(embedding=embedding_model, texts=text_chunks)
    return store

# --------------------------
# Conversation chain
# --------------------------
def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain backed by *vectorstore*.

    Args:
        vectorstore: Object whose ``as_retriever()`` result is used as the
            chain's retriever.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm`` from a
        ``ChatGroq`` model (``llama-3.3-70b-versatile``, temperature 0.5), the
        retriever, and a ``ConversationBufferMemory`` stored under the
        ``chat_history`` key.
    """
    chat_model = ChatGroq(temperature=0.5, model="llama-3.3-70b-versatile")
    history = ConversationBufferMemory(return_messages=True, memory_key='chat_history')
    retriever = vectorstore.as_retriever()
    chain = ConversationalRetrievalChain.from_llm(
        memory=history,
        retriever=retriever,
        llm=chat_model,
    )
    return chain

# --------------------------
# Handle user input
# --------------------------
def handle_userinput(user_question):
    """Send a question to the active conversation chain and render the answer.

    Reads the chain from ``st.session_state.conversation``. If it is ``None``
    a warning is shown and nothing else happens. Otherwise the question is
    sent to the chain and the answer is rendered together with the response
    time and the answer's word count. If the chain raises, the error is
    logged and an error message is shown instead of a raw traceback.

    Args:
        user_question: The question text to send to the chain.

    Returns:
        None. All output goes through Streamlit widgets.
    """
    chain = st.session_state.conversation
    if chain is None:
        st.warning("⚠ Please process the documents first.")
        return

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()
    try:
        with st.spinner("🤖 Thinking..."):
            # invoke() is the supported entry point; calling the chain object
            # directly is deprecated in LangChain.
            response = chain.invoke({'question': user_question})
    except Exception:
        # UI boundary: keep the app usable when the LLM or network call fails,
        # but record the full traceback for diagnosis.
        logging.exception("Conversation chain failed to answer the question")
        st.error("❌ Something went wrong while generating the answer. Please try again.")
        return
    elapsed_time = round(time.perf_counter() - started, 2)

    # Show response only (no chat history)
    answer = response['answer']
    st.markdown(f"**🤖 Bot:** {answer}")
    st.info(f"⏱ Response Time: {elapsed_time}s | 📄 Words: {len(answer.split())}")

# --------------------------
# Main Streamlit App
# --------------------------
def _process_uploaded_pdfs(pdf_docs):
    """Index the uploaded PDFs and store a new conversation chain in session state."""
    if not pdf_docs:
        st.warning("Please upload at least one PDF.")
        return

    with st.spinner("📖 Reading & Processing..."):
        raw_text, page_count = get_pdf_text(pdf_docs)
        st.session_state.pages_processed = page_count
        if not raw_text.strip():
            st.error("No valid text found in PDFs.")
            return

        chunks = get_text_chunks(raw_text)
        store = get_vectorstore(chunks)
        st.session_state.conversation = get_conversation_chain(store)
        st.success(f"✅ {len(pdf_docs)} file(s) processed | 📄 {page_count} pages")


def main():
    """Render the Streamlit page: PDF upload sidebar plus the question form.

    Initializes the ``conversation`` and ``pages_processed`` session-state
    entries when missing, processes uploaded PDFs when the sidebar button is
    pressed, and forwards a submitted non-blank question to
    ``handle_userinput``.
    """
    st.set_page_config(page_title="InfinaDocs Knowledge Sphere", page_icon="📚", layout="wide")
    st.title("📚 InfinaDocs Knowledge Sphere")
    st.markdown("Chat with your documents using **LLaMA 3.3** and **Groq AI**. 🚀")

    # Seed session state on first run only; existing values survive reruns.
    session_defaults = (("conversation", None), ("pages_processed", 0))
    for key, initial in session_defaults:
        if key not in st.session_state:
            st.session_state[key] = initial

    # Sidebar: upload widget and processing trigger.
    with st.sidebar:
        st.header("📂 Upload & Process")
        pdf_docs = st.file_uploader("Upload PDFs", accept_multiple_files=True, type=["pdf"])
        process_clicked = st.button("🚀 Process Documents")
        if process_clicked:
            _process_uploaded_pdfs(pdf_docs)

    # Main area: question input and submission.
    st.subheader("💬 Ask a Question")
    user_question = st.text_input("Type your question here...")
    submitted = st.button("Submit Question")
    if submitted:
        if not user_question.strip():
            st.warning("Please enter a question before submitting.")
        else:
            handle_userinput(user_question)

# Call main() only when this module is executed directly, not when imported.
if __name__ == '__main__':
    main()