msaifee committed on
Commit
e1dd2c4
·
verified ·
1 Parent(s): 6769473

saving file to a temporary location for embeddings

Browse files
Files changed (1) hide show
  1. app.py +12 -10
app.py CHANGED
@@ -50,26 +50,28 @@ def get_huggingface_pipeline():
50
  if st.button("Process PDFs") and uploaded_files:
51
  all_documents = []
52
 
53
- for file in uploaded_files:
54
- loader = PyPDFLoader(BytesIO(file.getvalue()))
 
 
 
 
 
 
55
  pdf_docs = loader.load()
56
 
 
57
  text_splitter = RecursiveCharacterTextSplitter(
58
  chunk_size=1000,
59
- chunk_overlap=100,
60
  separators=["\n\n", "\n", " ", ""]
61
  )
62
 
63
- docs = []
64
  for doc in pdf_docs:
65
  chunks = text_splitter.split_text(doc.page_content)
66
  for chunk in chunks:
67
- docs.append({
68
- "page_content": chunk,
69
- "metadata": doc.metadata
70
- })
71
-
72
- all_documents.extend(docs)
73
 
74
  # Create embeddings with Hugging Face
75
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
50
  if st.button("Process PDFs") and uploaded_files:
51
  all_documents = []
52
 
53
+ for file in uploaded_files:
54
+ # Save the file temporarily
55
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
56
+ temp_file.write(file.getvalue())
57
+ temp_file_path = temp_file.name
58
+
59
+ # Load the PDF using PyPDFLoader
60
+ loader = PyPDFLoader(temp_file_path)
61
  pdf_docs = loader.load()
62
 
63
+ # Split text into manageable chunks
64
  text_splitter = RecursiveCharacterTextSplitter(
65
  chunk_size=1000,
66
+ chunk_overlap=300,
67
  separators=["\n\n", "\n", " ", ""]
68
  )
69
 
 
70
  for doc in pdf_docs:
71
  chunks = text_splitter.split_text(doc.page_content)
72
  for chunk in chunks:
73
+ # Create Document object for each chunk
74
+ all_documents.append(Document(page_content=chunk, metadata=doc.metadata))
 
 
 
 
75
 
76
  # Create embeddings with Hugging Face
77
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")