Spaces:
Build error
Build error
| import os | |
| import tempfile | |
| import streamlit as st | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import FAISS | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.chains import RetrievalQA | |
| from io import BytesIO | |
| from langchain.document_loaders import PyPDFLoader | |
| from transformers import pipeline | |
| from langchain.schema import Document | |
| from dotenv import load_dotenv | |
| from transformers import AutoTokenizer | |
| import transformers | |
| import torch | |
| # Load environment variables from Hugging Face Secrets | |
| load_dotenv() | |
| os.environ['HUGGINGFACE_API_KEY'] = os.getenv("HF_TOKEN") | |
| os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY") | |
| os.environ["LANGCHAIN_TRACING_V2"] = "true" | |
| os.environ["LANGCHAIN_PROJECT"]="Research-Paper-Summarizer" | |
| # Streamlit Page Config | |
| st.set_page_config( | |
| page_title="Research Paper Summarizer", | |
| layout="centered" | |
| ) | |
| st.title("๐ Research Paper Summarizer - Using Open Source Models") | |
| # File Uploader | |
| uploaded_files = st.file_uploader( | |
| "Upload one or more research PDFs", | |
| type=["pdf"], | |
| accept_multiple_files=True | |
| ) | |
| # A placeholder to store vector database (FAISS) | |
| if "vector_store" not in st.session_state: | |
| st.session_state.vector_store = None | |
| # Hugging Face LLM Model Pipeline | |
| def get_huggingface_pipeline(): | |
| model_name = "meta-llama/Llama-3.2-1B" | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| st.info("Loading Hugging Face Model... Please wait.") | |
| return transformers.pipeline( | |
| "text-generation", | |
| model=model_name, | |
| tokenizer=tokenizer, | |
| max_new_tokens=256, | |
| torch_dtype=torch.bfloat16 | |
| ) | |
| # Process the PDFs, Create/Update the Vector Store | |
| if st.button("Process PDFs") and uploaded_files: | |
| all_documents = [] | |
| for file in uploaded_files: | |
| # Save the file temporarily | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file: | |
| temp_file.write(file.getvalue()) | |
| temp_file_path = temp_file.name | |
| # Load the PDF using PyPDFLoader | |
| loader = PyPDFLoader(temp_file_path) | |
| pdf_docs = loader.load() | |
| # Split text into manageable chunks | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=1000, | |
| chunk_overlap=300, | |
| separators=["\n\n", "\n", " ", ""] | |
| ) | |
| for doc in pdf_docs: | |
| chunks = text_splitter.split_text(doc.page_content) | |
| for chunk in chunks: | |
| # Create Document object for each chunk | |
| all_documents.append(Document(page_content=chunk, metadata=doc.metadata)) | |
| # Create embeddings with Hugging Face | |
| embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
| st.session_state.vector_store = FAISS.from_documents( | |
| documents=all_documents, | |
| embedding=embeddings | |
| ) | |
| st.success("PDFs processed and vector store created!") | |
| # Query + Summarize | |
| query = st.text_input("Enter your question or summary request:") | |
| if st.button("Get Summary/Answer"): | |
| if st.session_state.vector_store is None: | |
| st.warning("Please upload and process PDFs first.") | |
| else: | |
| retriever = st.session_state.vector_store.as_retriever( | |
| search_type="similarity", | |
| search_kwargs={"k": 5} | |
| ) | |
| # Use Hugging Face LLM | |
| hf_pipeline = get_huggingface_pipeline() | |
| # Retrieve documents and generate response | |
| relevant_docs = retriever.get_relevant_documents(query) | |
| context_text = "\n".join([doc.page_content for doc in relevant_docs]) | |
| # Generate answer using Hugging Face model | |
| response = hf_pipeline(f"Context: {context_text}\nQuestion: {query}", num_return_sequences=1) | |
| st.markdown("### Answer:") | |
| st.write(response[0]['generated_text']) | |
| with st.expander("Show source documents"): | |
| for i, doc in enumerate(relevant_docs): | |
| st.markdown(f"**Source Document {i + 1}:**") | |
| st.write(doc.page_content) | |
| st.write("---") |