Dinesh310 commited on
Commit
1b7129e
·
verified ·
1 Parent(s): afa5c1b

Update src/document_ingestion/document_processor.py

Browse files
src/document_ingestion/document_processor.py CHANGED
@@ -1,104 +1,103 @@
1
- """Document processing module for loading and splitting documents"""
2
-
3
- from typing import List
4
- from langchain_community.document_loaders import WebBaseLoader
5
- from langchain_text_splitters import RecursiveCharacterTextSplitter
6
- from langchain.schema import Document
7
-
8
- from typing import List, Union
9
- from pathlib import Path
10
- from langchain_community.document_loaders import (
11
- WebBaseLoader,
12
- PyPDFLoader,
13
- TextLoader,
14
- PyPDFDirectoryLoader
15
- )
16
-
17
class DocumentProcessor:
    """Load documents from URLs, PDF files/directories, or TXT files and
    split them into chunks for downstream retrieval."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize the document processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL."""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory."""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file."""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a single PDF file."""
        # BUG FIX: the original ignored file_path and loaded the hard-coded
        # "data" directory via PyPDFDirectoryLoader.
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files/directories, or TXT files.

        Args:
            sources: List of URLs, PDF file/folder paths, or TXT file paths

        Returns:
            List of loaded documents

        Raises:
            ValueError: If a source is not a URL, a directory, a .txt file,
                or a .pdf file.
        """
        docs: List[Document] = []
        for src in sources:
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                # BUG FIX: original fell through and also ran the path
                # branch (with a hard-coded Path("data")) for every URL.
                continue

            # BUG FIX: original hard-coded Path("data") instead of the
            # actual source being iterated.
            path = Path(src)
            if path.is_dir():  # PDF directory
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            elif path.suffix.lower() == ".pdf":
                # Generalization: route single PDF files to load_from_pdf,
                # which previously was never reachable from here.
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks.

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents.

        Args:
            urls: List of URLs (or file/directory paths) to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)
 
1
+ """Document processing module for loading and splitting documents"""
2
+
3
+ from typing import List, Union
4
+ from langchain_community.document_loaders import WebBaseLoader
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ # from langchain.schema import Document
7
+ from langchain_core.documents import Document
8
+ from pathlib import Path
9
+ from langchain_community.document_loaders import (
10
+ WebBaseLoader,
11
+ PyPDFLoader,
12
+ TextLoader,
13
+ PyPDFDirectoryLoader
14
+ )
15
+
16
class DocumentProcessor:
    """Load documents from URLs, PDF files/directories, or TXT files and
    split them into chunks for downstream retrieval."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize the document processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL."""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory."""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file."""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a single PDF file."""
        # BUG FIX: the original ignored file_path and loaded the hard-coded
        # "data" directory via PyPDFDirectoryLoader.
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files/directories, or TXT files.

        Args:
            sources: List of URLs, PDF file/folder paths, or TXT file paths

        Returns:
            List of loaded documents

        Raises:
            ValueError: If a source is not a URL, a directory, a .txt file,
                or a .pdf file.
        """
        docs: List[Document] = []
        for src in sources:
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                # BUG FIX: original fell through and also ran the path
                # branch (with a hard-coded Path("data")) for every URL.
                continue

            # BUG FIX: original hard-coded Path("data") instead of the
            # actual source being iterated.
            path = Path(src)
            if path.is_dir():  # PDF directory
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            elif path.suffix.lower() == ".pdf":
                # Generalization: route single PDF files to load_from_pdf,
                # which previously was never reachable from here.
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks.

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents.

        Args:
            urls: List of URLs (or file/directory paths) to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)