raksama19 committed on
Commit
41cfc53
·
verified ·
1 Parent(s): 97c1d86

Upload pdf_processor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. pdf_processor.py +143 -0
pdf_processor.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
import pickle
import re

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer

class PDFProcessor:
    """Build an embedded knowledge base from the PDF files in one directory.

    Each PDF is read page by page, the page text is cleaned and split into
    overlapping word chunks, and every chunk is embedded with a
    SentenceTransformer model. The result is written to a JSON file.
    """

    # Name of the sentence-transformers model used for the embeddings; also
    # recorded in the knowledge-base metadata.
    EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
    # Pages whose cleaned text is shorter than this many characters are skipped.
    MIN_PAGE_CHARS = 50
    # Chunks must be longer than this many characters to be kept.
    MIN_CHUNK_CHARS = 30

    def __init__(self, pdf_directory="/Users/maraksa/Downloads/chatbot/WebAIM/",
                 chunk_size=500, overlap=50):
        """Load the embedding model and make sure the PDF directory exists.

        Args:
            pdf_directory: Directory that is scanned for PDF files. It is
                created (empty) when it does not exist yet.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by two consecutive chunks.
        """
        self.pdf_directory = pdf_directory
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.embedder = SentenceTransformer(self.EMBEDDING_MODEL)

        # Check if directory exists
        if not os.path.exists(pdf_directory):
            # exist_ok guards against the directory appearing between the
            # check above and this call.
            os.makedirs(pdf_directory, exist_ok=True)
            print(f"Created directory: {pdf_directory}")
            print("Please add your WebAIM PDF files to this directory.")

    def clean_text(self, text):
        """Return *text* with whitespace normalised and page counters removed.

        All runs of whitespace (including line breaks) become a single space,
        "Page N of M" markers are deleted, and the result is stripped.
        """
        # Collapse whitespace first so "Page 3\nof 10" is matched as well.
        text = re.sub(r'\s+', ' ', text)

        # Remove common PDF artifacts
        text = re.sub(r'Page \d+ of \d+', '', text)

        # NOTE: an earlier version also ran r'WebAIM.*?\n' here. It could
        # never match because no newline survives the collapse above. It is
        # left out on purpose: enabling it would delete body text that merely
        # mentions WebAIM.

        # Deleting a marker leaves two adjacent spaces behind; collapse again.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def extract_text_from_pdf(self, pdf_path):
        """Extract cleaned, chunked text from one PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list of dicts with the keys 'text', 'source_file',
            'page_number' (1-based), 'chunk_id' (index of the chunk within
            its page) and 'source_type'.
        """
        source_file = os.path.basename(pdf_path)
        print(f"Processing: {source_file}")
        pages_content = []

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                cleaned_text = self.clean_text(page.get_text())

                # Skip pages with very little content
                if len(cleaned_text) < self.MIN_PAGE_CHARS:
                    continue

                chunks = self.chunk_text(
                    cleaned_text,
                    chunk_size=self.chunk_size,
                    overlap=self.overlap,
                )

                for chunk_idx, chunk in enumerate(chunks):
                    # Only keep substantial chunks
                    if len(chunk.strip()) > self.MIN_CHUNK_CHARS:
                        pages_content.append({
                            'text': chunk,
                            'source_file': source_file,
                            'page_number': page_number,
                            'chunk_id': chunk_idx,
                            'source_type': 'WebAIM',
                        })

        print(f"✅ Extracted {len(pages_content)} chunks from {source_file}")
        return pages_content

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split *text* into overlapping chunks of whitespace-separated words.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words repeated at the start of the next chunk;
                must satisfy ``0 <= overlap < chunk_size``.

        Returns:
            The chunks in order, each a single-space-joined string. Empty
            text yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # This chunk already reaches the last word. Any further window
            # would only repeat words it contains.
            if start + chunk_size >= len(words):
                break

        return chunks

    def process_all_pdfs(self):
        """Extract chunks from every PDF in ``self.pdf_directory``.

        Files are processed in sorted name order. A file that fails to
        process is reported and skipped so the remaining files still run.

        Returns:
            The combined list of chunk dicts, or an empty list when the
            directory contains no PDF files.
        """
        # Match the extension case-insensitively (e.g. "Report.PDF") and sort
        # so the chunk order does not depend on os.listdir's arbitrary order.
        pdf_files = sorted(
            name for name in os.listdir(self.pdf_directory)
            if name.lower().endswith('.pdf')
        )

        if not pdf_files:
            print(f"❌ No PDF files found in {self.pdf_directory}")
            print("Please add your WebAIM PDF files to this directory.")
            return []

        print(f"Found {len(pdf_files)} PDF files:")
        for pdf_file in pdf_files:
            print(f" - {pdf_file}")

        all_content = []
        for filename in pdf_files:
            pdf_path = os.path.join(self.pdf_directory, filename)
            try:
                all_content.extend(self.extract_text_from_pdf(pdf_path))
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the batch.
                print(f"❌ Error processing {filename}: {str(e)}")

        return all_content

    def create_knowledge_base(self, output_path="knowledge_base.json"):
        """Create a searchable knowledge base from the PDFs and save it as JSON.

        Args:
            output_path: Path of the JSON file to write.

        Returns:
            The knowledge-base dict with the keys 'content', 'embeddings'
            and 'metadata', or ``None`` when no content could be extracted.
        """
        print("🚀 Starting PDF processing...")
        all_content = self.process_all_pdfs()

        if not all_content:
            print("❌ No content extracted. Please check your PDF files.")
            return None

        print(f"📄 Total chunks extracted: {len(all_content)}")
        print("🧠 Creating embeddings... (this may take a few minutes)")

        texts = [item['text'] for item in all_content]
        embeddings = self.embedder.encode(texts, show_progress_bar=True)

        # Save knowledge base
        knowledge_base = {
            'content': all_content,
            'embeddings': embeddings.tolist(),
            'metadata': {
                'total_chunks': len(all_content),
                'embedding_model': self.EMBEDDING_MODEL,
                # Report the settings actually used for chunking.
                'chunk_size': self.chunk_size,
                'overlap': self.overlap,
            },
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(knowledge_base, f, indent=2)

        print(f"✅ Knowledge base saved to {output_path}")
        print("📊 Summary:")
        print(f" - Total chunks: {len(all_content)}")
        print(f" - Embedding dimensions: {len(embeddings[0])}")
        print(f" - File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

        return knowledge_base

# Script entry point: build the knowledge base using the default settings.
if __name__ == "__main__":
    knowledge_base = PDFProcessor().create_knowledge_base()