File size: 4,126 Bytes
d3bfde1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import tempfile
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
from transformers import pipeline
from langchain.schema import Document
from dotenv import load_dotenv
from transformers import AutoTokenizer
import transformers
import torch

# Load environment variables (e.g. from a local .env file or Hugging Face Secrets)
load_dotenv()

# os.environ values must be strings: os.getenv returns None for a missing key,
# and assigning None to os.environ raises TypeError at startup. Guard each one
# so the app still boots when a secret is not configured.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HUGGINGFACE_API_KEY"] = _hf_token

_langchain_key = os.getenv("LANGCHAIN_API_KEY")
if _langchain_key:
    os.environ["LANGCHAIN_API_KEY"] = _langchain_key

# Enable LangSmith tracing for this project
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Research-Paper-Summarizer"

# Streamlit page configuration, applied before any other UI element.
_page_config = {
    "page_title": "Research Paper Summarizer",
    "layout": "centered",
}
st.set_page_config(**_page_config)

st.title("📚 Research Paper Summarizer - Using Open Source Models")

# Multi-file PDF uploader; returns a (possibly empty) list of uploaded files.
uploaded_files = st.file_uploader(
    "Upload one or more research PDFs",
    type=["pdf"],
    accept_multiple_files=True,
)

# Persist the FAISS vector store across Streamlit reruns; None until the
# user has processed at least one PDF.
st.session_state.setdefault("vector_store", None)

# Hugging Face LLM Model Pipeline
@st.cache_resource(show_spinner=False)
def get_huggingface_pipeline():
    """Build and return the text-generation pipeline used for answering.

    Decorated with ``st.cache_resource`` so the tokenizer and model are
    downloaded/loaded once per server process instead of on every button
    press (each press previously re-created the whole pipeline).

    Returns:
        A ``transformers`` text-generation pipeline for
        ``meta-llama/Llama-3.2-1B`` capped at 256 new tokens.
    """
    model_name = "meta-llama/Llama-3.2-1B"

    # Show the notice *before* the slow work starts, not after the tokenizer
    # has already been fetched.
    st.info("Loading Hugging Face Model... Please wait.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return transformers.pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_new_tokens=256,
        torch_dtype=torch.bfloat16,
    )


# Process the PDFs, Create/Update the Vector Store
if st.button("Process PDFs") and uploaded_files:
    all_documents = []

    for file in uploaded_files:
        # Save the file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file.getvalue())
            temp_file_path = temp_file.name

        # Load the PDF using PyPDFLoader
        loader = PyPDFLoader(temp_file_path)
        pdf_docs = loader.load()

        # Split text into manageable chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=300,
            separators=["\n\n", "\n", " ", ""]
        )

        for doc in pdf_docs:
            chunks = text_splitter.split_text(doc.page_content)
            for chunk in chunks:
                # Create Document object for each chunk
                all_documents.append(Document(page_content=chunk, metadata=doc.metadata))

    # Create embeddings with Hugging Face
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    st.session_state.vector_store = FAISS.from_documents(
        documents=all_documents,
        embedding=embeddings
    )

    st.success("PDFs processed and vector store created!")

# Query + Summarize
query = st.text_input("Enter your question or summary request:")

if st.button("Get Summary/Answer"):
    if st.session_state.vector_store is None:
        st.warning("Please upload and process PDFs first.")
    elif not query.strip():
        # An empty query would retrieve arbitrary chunks and produce noise.
        st.warning("Please enter a question or summary request.")
    else:
        retriever = st.session_state.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        )

        # Use Hugging Face LLM
        hf_pipeline = get_huggingface_pipeline()

        # Retrieve documents and build the prompt context
        relevant_docs = retriever.get_relevant_documents(query)
        context_text = "\n".join(doc.page_content for doc in relevant_docs)

        # return_full_text=False: text-generation pipelines otherwise echo the
        # entire prompt (context + question) at the start of generated_text,
        # so the displayed "answer" would begin with the whole context dump.
        response = hf_pipeline(
            f"Context: {context_text}\nQuestion: {query}",
            num_return_sequences=1,
            return_full_text=False,
        )

        st.markdown("### Answer:")
        st.write(response[0]['generated_text'])

        with st.expander("Show source documents"):
            for i, doc in enumerate(relevant_docs):
                st.markdown(f"**Source Document {i + 1}:**")
                st.write(doc.page_content)
                st.write("---")