"""
Process uploaded documents for the knowledge base.
"""
import os
import PyPDF2
import docx
from typing import Dict, List
import hashlib
class DocumentProcessor:
    """Save uploaded documents to disk and extract their text.

    Supported formats are PDF (via ``PyPDF2``), DOCX (via ``docx``) and
    UTF-8 plain text. Files of any other type are still saved, but yield
    empty text.
    """

    # Maximum number of characters of extracted text kept in the result of
    # ``process_uploaded_file``; the word count still covers the full text.
    MAX_TEXT_CHARS = 5000

    def __init__(self, upload_dir: str = "knowledge_base/internal_docs"):
        """Remember the upload directory and create it if it is missing.

        Args:
            upload_dir: Directory in which uploaded files are stored.
        """
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def process_uploaded_file(self, uploaded_file) -> Dict:
        """Persist an uploaded file and extract its text.

        Args:
            uploaded_file: Object exposing a ``name`` attribute and a
                ``getvalue()`` method returning the raw bytes (presumably a
                Streamlit ``UploadedFile`` -- confirm against the caller).

        Returns:
            A dict with the keys ``filename`` (the name as uploaded),
            ``file_hash`` (MD5 hex digest of the content), ``file_path``
            (where the file was saved), ``text`` (extracted text truncated
            to ``MAX_TEXT_CHARS``), ``word_count`` (whitespace-separated
            words in the full text) and ``status``.
        """
        data = uploaded_file.getvalue()
        file_hash = hashlib.md5(data).hexdigest()

        # The name comes from the client: keep only its final component so
        # that "../" or "..\\" segments cannot escape the upload directory.
        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        file_path = os.path.join(self.upload_dir, f"{file_hash}_{safe_name}")

        with open(file_path, "wb") as f:
            f.write(data)

        # Match extensions case-insensitively so "REPORT.PDF" is handled too.
        name_lower = uploaded_file.name.lower()
        text = ""
        if name_lower.endswith('.pdf'):
            text = self._extract_pdf_text(file_path)
        elif name_lower.endswith('.docx'):
            text = self._extract_docx_text(file_path)
        elif name_lower.endswith('.txt'):
            # Best-effort like the PDF/DOCX paths: undecodable bytes become
            # U+FFFD instead of aborting the whole upload.
            text = data.decode('utf-8', errors='replace')

        return {
            'filename': uploaded_file.name,
            'file_hash': file_hash,
            'file_path': file_path,
            'text': text[:self.MAX_TEXT_CHARS],  # Limit text for processing
            'word_count': len(text.split()),
            'status': 'processed',
        }

    def _extract_pdf_text(self, file_path: str) -> str:
        """Extract text from a PDF file, one newline-terminated chunk per page.

        Errors are reported with ``print`` and not raised; whatever text was
        collected before the failure is returned.
        """
        parts = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against a falsy/None result for pages that
                    # carry no extractable text.
                    parts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return "".join(parts)

    def _extract_docx_text(self, file_path: str) -> str:
        """Extract text from a DOCX file, one newline-terminated line per paragraph.

        Errors are reported with ``print`` and not raised; whatever text was
        collected before the failure is returned.
        """
        parts = []
        try:
            doc = docx.Document(file_path)
            for paragraph in doc.paragraphs:
                parts.append(paragraph.text + "\n")
        except Exception as e:
            print(f"Error reading DOCX: {e}")
        return "".join(parts)

    def search_in_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
        """Case-insensitively search processed documents for a substring.

        Args:
            query: Text to look for. An empty query returns no results
                (it would otherwise trivially match every document).
            documents: Dicts carrying at least ``filename`` and ``text``
                keys, e.g. the results of ``process_uploaded_file``.

        Returns:
            One dict per matching document with the keys ``document``,
            ``match``, ``context`` (up to 100 characters either side of the
            first match, wrapped in ``...``) and ``relevance``.
        """
        results = []
        if not query:
            return results

        query_lower = query.lower()
        for doc in documents:
            text = doc['text']
            # A single find() doubles as the membership test.
            idx = text.lower().find(query_lower)
            if idx == -1:
                continue
            # Context window around the first match.
            start = max(0, idx - 100)
            end = min(len(text), idx + len(query) + 100)
            results.append({
                'document': doc['filename'],
                'match': query,
                'context': f"...{text[start:end]}...",
                'relevance': 1.0,
            })
        return results