Spaces:

AnuragShirke
/

knowledge-assistant-backend

Sleeping

knowledge-assistant-backend / src /core /processing_gemini.py

Final Commit for Hosting Deployment

4cab845 6 months ago

1.71 kB

	import fitz # PyMuPDF
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import docx # Added for docx parsing
	import os

	def parse_pdf(file_path: str) -> str:
	"""Extracts text from a PDF file."""
	doc = fitz.open(file_path)
	text = ""
	for page in doc:
	text += page.get_text()
	doc.close()
	return text

	def parse_txt(file_path: str) -> str:
	"""Extracts text from a TXT file."""
	with open(file_path, 'r', encoding='utf-8') as f:
	text = f.read()
	return text

	def parse_docx(file_path: str) -> str:
	"""Extracts text from a DOCX file."""
	doc = docx.Document(file_path)
	text = ""
	for paragraph in doc.paragraphs:
	text += paragraph.text + "\n"
	return text

	def parse_document(file_path: str, file_type: str) -> str:
	"""Parse document based on file type."""
	if file_type == "pdf":
	return parse_pdf(file_path)
	elif file_type == "txt":
	return parse_txt(file_path)
	elif file_type == "docx":
	return parse_docx(file_path)
	else:
	raise ValueError(f"Unsupported file type: {file_type}")

	def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
	"""Split text into chunks using LangChain's text splitter."""
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=chunk_size,
	chunk_overlap=chunk_overlap,
	length_function=len,
	)
	chunks = text_splitter.split_text(text)
	return chunks

	def get_embedding_model():
	"""Return None since we're using Gemini API for embeddings."""
	# This function is kept for compatibility but returns None
	# since we're using Gemini API for embeddings
	return None