Spaces:

msaifee
/

Research-Paper-Summerizer

Build error

msaifee committed on Feb 7, 2025

Commit

e1dd2c4

verified ·

1 Parent(s): 6769473

Saving file to a temporary location for embeddings

Files changed (1) hide show

app.py CHANGED Viewed

@@ -50,26 +50,28 @@ def get_huggingface_pipeline():
 if st.button("Process PDFs") and uploaded_files:
     all_documents = []
-    for file in uploaded_files:
-        loader = PyPDFLoader(BytesIO(file.getvalue()))
         pdf_docs = loader.load()
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
-            chunk_overlap=100,
             separators=["\n\n", "\n", " ", ""]
         )
-        docs = []
         for doc in pdf_docs:
             chunks = text_splitter.split_text(doc.page_content)
             for chunk in chunks:
-                docs.append({
-                    "page_content": chunk,
-                    "metadata": doc.metadata
-                })
-        all_documents.extend(docs)
     # Create embeddings with Hugging Face
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

 if st.button("Process PDFs") and uploaded_files:
     all_documents = []
+   for file in uploaded_files:
+        # Save the file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(file.getvalue())
+            temp_file_path = temp_file.name
+        # Load the PDF using PyPDFLoader
+        loader = PyPDFLoader(temp_file_path)
         pdf_docs = loader.load()
+        # Split text into manageable chunks
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
+            chunk_overlap=300,
             separators=["\n\n", "\n", " ", ""]
         )
         for doc in pdf_docs:
             chunks = text_splitter.split_text(doc.page_content)
             for chunk in chunks:
+                # Create Document object for each chunk
+                all_documents.append(Document(page_content=chunk, metadata=doc.metadata))
     # Create embeddings with Hugging Face
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")