msaifee committed on
Commit
362b129
·
verified ·
1 Parent(s): 73860f6

Summarizer using DeepSeek R1

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py CHANGED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
11
+
12
# Read variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials used by the
# embeddings come from -- confirm against the deployment setup.
load_dotenv()

# Page-level Streamlit settings: browser tab title and centered layout.
st.set_page_config(page_title="Research Paper Summarizer", layout="centered")

# Main heading shown at the top of the app.
st.title("📚 Research Paper Summarizer with DeepSeekR1")
22
+
23
+ # Load DeepSeekR1 model
24
@st.cache_resource
def load_llm():
    """Build the text2text-generation pipeline used to answer queries.

    The function is wrapped in ``st.cache_resource``, so the tokenizer and
    model are meant to be loaded once and reused rather than rebuilt on
    every call.

    Returns:
        A ``transformers`` pipeline for the "text2text-generation" task,
        wrapping the loaded tokenizer and seq2seq model.
    """
    # NOTE(review): the checkpoint is loaded through AutoModelForSeq2SeqLM;
    # confirm this repo id exists and is an encoder-decoder checkpoint.
    checkpoint = "togethercomputer/deepseekr-1"
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # fetched before the model weights.
    return pipeline(
        "text2text-generation",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )
30
+
31
# Obtain the generation pipeline used later to answer queries.
llm_pipeline = load_llm()

# Upload widget: accepts several files at once, restricted to PDFs.
uploaded_files = st.file_uploader(
    "Upload one or more research PDFs",
    accept_multiple_files=True,
    type=["pdf"],
)

# Make sure the session always has a "vector_store" slot; None means that no
# PDFs have been processed yet.
if "vector_store" not in st.session_state:
    st.session_state["vector_store"] = None
43
+
44
# Process PDFs and create/update the vector store.
#
# On click: every uploaded PDF is written to a temporary file, loaded page by
# page, split into overlapping chunks, and the chunks are embedded into a
# FAISS index stored in st.session_state.vector_store.
if st.button("Process PDFs"):
    if not uploaded_files:
        # Explain why nothing happened instead of silently ignoring the click.
        st.warning("Please upload at least one PDF before processing.")
    else:
        # The splitter settings do not depend on the file, so build it once
        # rather than once per uploaded file.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            separators=["\n\n", "\n", " ", ""],
        )
        all_documents = []

        for file in uploaded_files:
            # The loader is given a filesystem path, so write the uploaded
            # bytes to disk first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                temp_file.write(file.getvalue())
                temp_file_path = temp_file.name

            try:
                pdf_docs = PyPDFLoader(temp_file_path).load()
            finally:
                # delete=False means the file outlives the `with` block;
                # remove it explicitly so temp files do not accumulate.
                os.remove(temp_file_path)

            # Chunk every page; each chunk keeps the metadata of its page.
            all_documents.extend(text_splitter.split_documents(pdf_docs))

        if not all_documents:
            # Nothing to index (e.g. PDFs without extractable text); avoid
            # handing an empty list to the vector store.
            st.error("No extractable text was found in the uploaded PDFs.")
        else:
            # Create vector store from documents
            embeddings = OpenAIEmbeddings()
            st.session_state.vector_store = FAISS.from_documents(
                documents=all_documents,
                embedding=embeddings,
            )

            st.success("PDFs processed and vector store created! ✅")
79
+
80
# Query + Summarize.
#
# On click: retrieve the 5 chunks most similar to the query from the vector
# store, join them into one context string, and ask the generation pipeline
# to answer the query from that context.
query = st.text_input("Enter your question or summary request:")

if st.button("Get Summary/Answer"):
    if st.session_state.vector_store is None:
        st.warning("Please upload and process PDFs first.")
    elif not query or not query.strip():
        # A blank query gives the retriever and the model nothing to work with.
        st.warning("Please enter a question or summary request.")
    else:
        # Extract relevant text for summarization
        retriever = st.session_state.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5},
        )
        retrieved_docs = retriever.get_relevant_documents(query)

        # Combine the content of retrieved documents
        context_text = " ".join(doc.page_content for doc in retrieved_docs)

        # Generate answer using DeepSeekR1 model
        prompt = f"Context: {context_text}\nQuestion: {query}\nAnswer:"
        result = llm_pipeline(prompt, max_length=300, num_return_sequences=1)

        st.markdown("### Answer:")
        st.write(result[0]['generated_text'])

        # Let the user inspect the chunks the answer was based on.
        with st.expander("Show source documents"):
            for i, doc in enumerate(retrieved_docs):
                st.markdown(f"**Source Document {i+1}:**")
                st.write(doc.page_content)
                st.write("---")