Spaces:

yekkala
/

Homoeopathy-Bot

Sleeping

App Files Files Community

Homoeopathy-Bot / utils /document_processor.py

yekkala

Create document_processor.py

099c54b verified 4 months ago

raw

history blame contribute delete

3.02 kB

	"""
	Process uploaded documents for knowledge base
	"""
	import os
	import PyPDF2
	import docx
	from typing import Dict, List
	import hashlib

	class DocumentProcessor:
	    """Store uploaded documents on disk and extract their text.

	    PDF files are read with ``PyPDF2``, DOCX files with ``docx`` and ``.txt``
	    files are decoded as UTF-8. Files with any other extension are still
	    saved, but their extracted text is empty.
	    """

	    # Maximum number of characters of extracted text returned to callers.
	    MAX_TEXT_CHARS = 5000
	    # Characters of surrounding text kept on each side of a search hit.
	    CONTEXT_CHARS = 100

	    def __init__(self, upload_dir: str = "knowledge_base/internal_docs"):
	        """Remember ``upload_dir`` and create it if it does not exist yet."""
	        self.upload_dir = upload_dir
	        os.makedirs(upload_dir, exist_ok=True)

	    def process_uploaded_file(self, uploaded_file) -> Dict:
	        """Save an uploaded document and extract its text.

	        Args:
	            uploaded_file: Any object exposing a ``name`` string and a
	                ``getvalue()`` method that returns the file content as bytes.

	        Returns:
	            A dict with the keys ``filename`` (the name as uploaded),
	            ``file_hash`` (MD5 hex digest of the content), ``file_path``
	            (where the copy was written), ``text`` (extracted text, truncated
	            to ``MAX_TEXT_CHARS``), ``word_count`` (whitespace-separated words
	            in the *untruncated* text) and ``status`` (always ``'processed'``).
	        """
	        # Read the payload once; it feeds the hash, the saved copy and the
	        # plain-text decoding below.
	        data = uploaded_file.getvalue()
	        file_hash = hashlib.md5(data).hexdigest()

	        # The name comes from the client, so keep only its last path component:
	        # otherwise something like "../../x" would be written outside
	        # ``upload_dir``. Backslashes are normalised first so Windows-style
	        # paths are stripped on POSIX hosts as well.
	        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/")) or "upload"
	        file_path = os.path.join(self.upload_dir, f"{file_hash}_{safe_name}")

	        # Save the file
	        with open(file_path, "wb") as f:
	            f.write(data)

	        # Extract text based on file type; compare case-insensitively so that
	        # "REPORT.PDF" is treated like "report.pdf".
	        lowered_name = safe_name.lower()
	        text = ""
	        if lowered_name.endswith('.pdf'):
	            text = self._extract_pdf_text(file_path)
	        elif lowered_name.endswith('.docx'):
	            text = self._extract_docx_text(file_path)
	        elif lowered_name.endswith('.txt'):
	            # Undecodable bytes become U+FFFD instead of raising, so a text
	            # file in another encoding still yields usable output.
	            text = data.decode('utf-8', errors='replace')

	        return {
	            'filename': uploaded_file.name,
	            'file_hash': file_hash,
	            'file_path': file_path,
	            'text': text[:self.MAX_TEXT_CHARS],  # Limit text for processing
	            'word_count': len(text.split()),
	            'status': 'processed'
	        }

	    def _extract_pdf_text(self, file_path: str) -> str:
	        """Return the text of every page of the PDF, one page per line group.

	        Extraction is best-effort: on any error a message is printed and the
	        text collected so far (possibly empty) is returned.
	        """
	        pages: List[str] = []
	        try:
	            with open(file_path, 'rb') as file:
	                pdf_reader = PyPDF2.PdfReader(file)
	                for page in pdf_reader.pages:
	                    # Defensive ``or ""``: a page that yields no string must
	                    # not abort extraction of the remaining pages.
	                    pages.append((page.extract_text() or "") + "\n")
	        except Exception as e:
	            print(f"Error reading PDF: {e}")
	        return "".join(pages)

	    def _extract_docx_text(self, file_path: str) -> str:
	        """Return the text of all paragraphs of the DOCX, one per line.

	        Extraction is best-effort: on any error a message is printed and the
	        text collected so far (possibly empty) is returned.
	        """
	        paragraphs: List[str] = []
	        try:
	            doc = docx.Document(file_path)
	            for paragraph in doc.paragraphs:
	                paragraphs.append(paragraph.text + "\n")
	        except Exception as e:
	            print(f"Error reading DOCX: {e}")
	        return "".join(paragraphs)

	    def search_in_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
	        """Find documents whose text contains ``query``, ignoring case.

	        Args:
	            query: Substring to look for. An empty or whitespace-only query
	                returns no results (it would otherwise match every document).
	            documents: Dicts carrying ``'filename'`` and ``'text'`` entries,
	                such as those returned by ``process_uploaded_file``. Entries
	                without text are skipped.

	        Returns:
	            One dict per matching document with the keys ``document``,
	            ``match``, ``context`` (up to ``CONTEXT_CHARS`` characters on each
	            side of the first occurrence, wrapped in ``...``) and
	            ``relevance`` (always ``1.0``).
	        """
	        results: List[Dict] = []
	        if not query or not query.strip():
	            return results

	        query_lower = query.lower()

	        for doc in documents:
	            text = doc.get('text') or ""
	            # Lower-case once and reuse the result for both the membership
	            # test and the position lookup.
	            idx = text.lower().find(query_lower)
	            if idx == -1:
	                continue

	            # Find context around the match.
	            # NOTE: the index comes from the lower-cased text; for the rare
	            # characters whose lower-case form has a different length the
	            # window may be shifted by a character or two.
	            start = max(0, idx - self.CONTEXT_CHARS)
	            end = min(len(text), idx + len(query) + self.CONTEXT_CHARS)
	            context = text[start:end]

	            results.append({
	                'document': doc['filename'],
	                'match': query,
	                'context': f"...{context}...",
	                'relevance': 1.0
	            })

	        return results