|
|
import os
import tempfile
from datetime import datetime
from typing import List

import streamlit as st

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
|
|
|
|
|
def process_pdf(file) -> List:
    """Convert an uploaded PDF into metadata-tagged text chunks.

    Args:
        file: Uploaded file object (e.g. a Streamlit ``UploadedFile``)
            exposing ``getvalue()`` and ``name``.

    Returns:
        List of Document chunks (chunk_size=1000, chunk_overlap=200), each
        with ``source_type``/``file_name``/``timestamp`` metadata added.
        Returns an empty list if processing fails (error shown in the UI).
    """
    tmp_path = None
    try:
        # PyPDFLoader needs a filesystem path, so spill the upload to a
        # named temp file. delete=False keeps the file past the `with` so
        # the loader can open it by path (required on Windows, where an
        # open handle can't be re-opened); we remove it in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file.getvalue())
            tmp_path = tmp_file.name

        # Load AFTER the `with` block: the handle is now closed, so all
        # buffered bytes are flushed to disk before the loader reads them.
        documents = PyPDFLoader(tmp_path).load()

        # Tag each page with its origin so retrieved chunks are traceable.
        for doc in documents:
            doc.metadata.update({
                "source_type": "pdf",
                "file_name": file.name,
                "timestamp": datetime.now().isoformat()
            })

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        return text_splitter.split_documents(documents)

    except Exception as e:
        # UI boundary: surface the failure to the user, degrade to empty.
        st.error(f"π PDF processing error: {str(e)}")
        return []
    finally:
        # delete=False means no auto-cleanup — remove the temp file ourselves.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup; nothing useful to do on failure
|
|
|
|
|
|
|
|
def process_web(url: str) -> List:
    """Fetch a web page and split it into metadata-tagged text chunks.

    Args:
        url: Address of the page to load.

    Returns:
        List of Document chunks (chunk_size=1000, chunk_overlap=200), each
        carrying ``source_type``/``url``/``timestamp`` metadata. Returns an
        empty list if loading or splitting fails (error shown in the UI).
    """
    try:
        pages = WebBaseLoader(web_path=url).load()

        # Record provenance on every page so retrieval hits can be traced
        # back to the originating URL.
        for page in pages:
            page.metadata.update({
                "source_type": "url",
                "url": url,
                "timestamp": datetime.now().isoformat()
            })

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                  chunk_overlap=200)
        return splitter.split_documents(pages)

    except Exception as e:
        # UI boundary: report the failure and degrade to an empty result.
        st.error(f"π Web processing error: {str(e)}")
        return []
|
|
|