# ============================================================
# PHASE 1 DATA PREPROCESSING (FINAL UPDATED VERSION - PRO READY)
# FILE: models/phase1_data_preprocessing.py
#
# Supports:
# - IEEE + Non-standard journals
# - PDF -> text extraction (PyMuPDF)
# - SAFE cleaning (preserve tables + numbering)
# - Metadata Extraction:
#   Title, Authors, Affiliation, DOI, Year, Abstract, Keywords
# - References extraction
# - IMRAD split (heading-based + fallback)
#
# IMPORTANT FIX:
# - DO NOT destroy table structures
# - Preserve line breaks
# - Preserve numeric units and symbols (% , | , : , -)
#
# OUTPUT FORMAT (STRICT COMPATIBLE WITH PHASE 2):
# {
#   "paper_id": "...",
#   "title": "...",
#   "keywords": [...],
#   "abstract": "...",
#   "cleaned_text": "...",
#   "imrad_sections": {
#     "introduction": "...",
#     "methodology": "...",
#     "results": "...",
#     "conclusion": "..."
#   },
#   "references": "...",
#   "metadata": {...}
# }
# ============================================================

import re
import os
from datetime import datetime

# Safe import fitz
try:
    import fitz  # PyMuPDF
except ImportError as exc:
    # Chain the original error so the real cause stays visible in tracebacks.
    raise ImportError(
        "❌ PyMuPDF not installed. Run: pip install pymupdf"
    ) from exc


# ==========================================================
# SAFE STRING
# ==========================================================
def safe_str(value):
    """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def clean_text(text: str) -> str:
    """Collapse every whitespace run (including NBSP) to a single space."""
    text = safe_str(text)
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ============================================================
# 1) PDF TEXT EXTRACTION (COLUMN-AWARE)
# ============================================================

# Control characters that PDF extraction occasionally leaves behind.
# Tab, newline and carriage return are deliberately not in this set.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")


def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, reading two-column pages column by column.

    For every page the text blocks are split into three groups: blocks wider
    than 80% of the page, blocks starting in the left half, and blocks
    starting in the right half. Each group is sorted top-to-bottom and the
    groups are emitted in that order (wide, left, right).

    NOTE(review): every wide block of a page is emitted before the columns,
    regardless of its vertical position on the page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all blocks joined by blank lines, stripped.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"❌ pdf file not found: {pdf_path}")

    full_text = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            # page width gives the divider between the two columns
            width = page.rect.width
            mid_x = width / 2.0

            # layout blocks: (x0, y0, x1, y1, text, block_no, block_type)
            blocks = page.get_text("blocks")

            # keep text blocks only (block_type == 0)
            text_blocks = [b for b in blocks if b[6] == 0]

            full_width = []
            left_col = []
            right_col = []
            for block in text_blocks:
                x0, x1 = block[0], block[2]
                if (x1 - x0) > width * 0.8:
                    # spans most of the page: title / header / wide element
                    full_width.append(block)
                elif x0 < mid_x:
                    left_col.append(block)
                else:
                    right_col.append(block)

            # sort each group top-to-bottom (y0 coordinate)
            full_width.sort(key=lambda b: b[1])
            left_col.sort(key=lambda b: b[1])
            right_col.sort(key=lambda b: b[1])

            for block in full_width + left_col + right_col:
                text = _CONTROL_CHARS_RE.sub(" ", block[4])
                full_text.append(text.strip())
    finally:
        # Release the document even when a page fails to parse.
        doc.close()

    return "\n\n".join(full_text).strip()
# ============================================================
# 2) REMOVE IEEE FOOTER / LICENSE NOISE (SAFE)
# IMPORTANT: DOI MUST BE PRESERVED
# ============================================================

# Applied in order, case-insensitively. The specific page-footer forms
# ("volume 12, 2024" / "vol. 12, 2024") must run BEFORE the generic
# "vol. N" pattern; otherwise the generic one strips "vol. 12" first and
# leaves a dangling ", 2024" that the footer pattern can no longer match.
#
# NOTE(review): the generic "vol." / "no." / "pp." patterns also match
# inside bibliography entries (e.g. "vol. 12, no. 3, pp. 1-10"), so those
# fields are stripped from citations as well - confirm this is intended.
_IEEE_NOISE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # --------------------------------------
        # IEEE LICENSE
        # --------------------------------------
        r"authorized licensed use.*?restrictions apply\.?",
        r"downloaded on.*?from ieee xplore\.?",
        r"personal use is permitted.*?permission\.?",
        # --------------------------------------
        # IEEE ACCESS FOOTER
        # --------------------------------------
        r"©\s*\d{4}\s*ieee",
        r"ieee xplore",
        r"\$\d+\.\d+",
        # --------------------------------------
        # PAGE FOOTER (specific forms first, see note above)
        # --------------------------------------
        r"volume\s+\d+,\s*\d{4}",
        r"vol\.\s*\d+,\s*\d{4}",
        r"\bvol\.\s*\d+",
        r"\bno\.\s*\d+",
        r"\bpp\.\s*\d+\s*-\s*\d+",
        # --------------------------------------
        # RECEIVED / ACCEPTED BLOCK
        # --------------------------------------
        r"received\s+\d{1,2}\s+\w+\s+\d{4}.*?",
        r"accepted\s+\d{1,2}\s+\w+\s+\d{4}.*?",
        r"date\s+of\s+publication\s+\d{1,2}\s+\w+\s+\d{4}.*?",
        r"date\s+of\s+current\s+version\s+\d{1,2}\s+\w+\s+\d{4}.*?",
        # --------------------------------------
        # ASSOCIATE EDITOR
        # --------------------------------------
        r"the associate editor coordinating the review.*?publication.*?",
        # --------------------------------------
        # CREATIVE COMMONS LICENSE
        # --------------------------------------
        r"this work is licensed under a creative commons.*?",
        r"for more information,\s*see\s*https?://[^\s]+",
        # --------------------------------------
        # REMOVE REPEATED IEEE ACCESS HEADER
        # --------------------------------------
        r"w\.\s*han\s*et\s*al\.\s*:.*?classifier",
    )
)


def remove_ieee_noise(text: str):
    """Strip IEEE licence lines, footers and publication-date boilerplate.

    Each pattern is removed wherever it occurs; ``.`` does not cross line
    breaks, so a match never extends past the end of its line. DOI strings
    are not targeted by any pattern.

    Args:
        text: Extracted text; ``None`` or empty yields ``""``.

    Returns:
        The text with the noise removed, runs of four or more newlines
        collapsed to a blank line, and outer whitespace stripped.
    """
    if not text:
        return ""

    for pattern in _IEEE_NOISE_PATTERNS:
        text = pattern.sub("", text)

    # --------------------------------------
    # REMOVE EMPTY LINES
    # --------------------------------------
    text = re.sub(r"\n{4,}", "\n\n", text)

    return text.strip()
# ============================================================
# 3) SAFE CLEANING (PRESERVE TABLES + BULLETS)
# ============================================================
def clean_extracted_text(raw_text):
    """Normalise raw PDF text while keeping line breaks.

    Steps, in order: replace NBSP and tabs with spaces, strip IEEE noise,
    drop digit-only lines, re-join words hyphenated across a line break,
    stitch lines that end mid-sentence, repair a few known extraction
    typos, then squeeze repeated spaces and long runs of newlines.

    Returns ``""`` for empty or ``None`` input.
    """
    if not raw_text:
        return ""

    text = remove_ieee_noise(
        raw_text.replace("\u00a0", " ").replace("\t", " ")
    )

    regex_steps = (
        # page numbers standing alone on a line
        (r"^\s*\d+\s*$", "", re.MULTILINE),
        # hyphenated words broken across lines (e.g., classifi- \n cation)
        (r"([a-zA-Z])-\s*\n\s*([a-zA-Z])", r"\1\2", 0),
        # a line ending in a lowercase letter or comma is treated as
        # mid-sentence: that newline becomes a space
        (r"([a-z,])\n([a-zA-Z])", r"\1 \2", 0),
    )
    for pattern, replacement, flags in regex_steps:
        text = re.sub(pattern, replacement, text, flags=flags)

    # common pdf extraction typos
    typo_fixes = (
        ("face-toface", "face-to-face"),
        ("IoTbased", "IoT-based"),
        ("pre- processing", "preprocessing"),
        ("machinelearning", "machine learning"),
    )
    for broken, fixed in typo_fixes:
        text = text.replace(broken, fixed)

    # squeeze excess whitespace; blank lines between sections survive
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r"\n{4,}", "\n\n", text)
    return text.strip()


# ============================================================
# 4) DOI EXTRACTION
# ============================================================
def extract_doi(cleaned_text):
    """Return the first DOI found in ``cleaned_text``, or ``""``.

    Three patterns are tried in order: a bare DOI, a ``doi:`` label, and a
    ``Digital Object Identifier`` label. Matching is case-insensitive.
    """
    haystack = cleaned_text or ""

    doi_regexes = (
        # Standard DOI
        r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b",
        # DOI:
        r"\bdoi\s*:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b",
        # Digital Object Identifier
        r"\bdigital\s+object\s+identifier\s+(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b",
    )

    for doi_regex in doi_regexes:
        found = re.search(doi_regex, haystack, flags=re.IGNORECASE)
        if found is None:
            continue

        # labelled patterns capture the identifier in group 1
        candidate = found.group(1) if found.lastindex else found.group(0)
        return re.sub(
            r"^(doi\s*:?)", "", candidate.strip(), flags=re.IGNORECASE
        )

    return ""


# ============================================================
# 5) YEAR EXTRACTION
# ============================================================

_MIN_PLAUSIBLE_YEAR = 1990

# Explicit publication-history phrases, most authoritative first.
_PUBLICATION_YEAR_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"date of publication\s+\w+\s+\d{1,2},?\s+(20\d{2})",   # March 5, 2024
        r"date of publication\s+\d{1,2}\s+\w+\s+(20\d{2})",     # 5 March 2024
        r"accepted\s+\d{1,2}\s+\w+\s+(20\d{2})",
        r"received\s+\d{1,2}\s+\w+\s+(20\d{2})",
        r"current version\s+\d{1,2}\s+\w+\s+(20\d{2})",
    )
)

_ANY_YEAR_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")


def _latest_plausible_year(text, max_year):
    """Return the largest year in ``text`` within the plausible range, or ""."""
    candidates = [
        int(y) for y in _ANY_YEAR_RE.findall(text)
        if _MIN_PLAUSIBLE_YEAR <= int(y) <= max_year
    ]
    return str(max(candidates)) if candidates else ""


def extract_year(cleaned_text):
    """Guess the publication year of a paper.

    Priority 1: an explicit "date of publication / accepted / received /
    current version" phrase. Priority 2: the latest plausible year in the
    first 3000 characters. Priority 3: the latest plausible year anywhere.
    A year is plausible when it lies between 1990 and next year.

    Args:
        cleaned_text: Paper text (``None`` is treated as empty).

    Returns:
        The year as a string, or ``""`` when none is found.
    """
    cleaned_text = cleaned_text or ""
    max_year = datetime.now().year + 1

    for pattern in _PUBLICATION_YEAR_PATTERNS:
        m = pattern.search(cleaned_text)
        if m:
            year = int(m.group(1))
            if _MIN_PLAUSIBLE_YEAR <= year <= max_year:
                return str(year)

    return (
        _latest_plausible_year(cleaned_text[:3000], max_year)
        or _latest_plausible_year(cleaned_text, max_year)
    )


# ============================================================
# 6) TITLE EXTRACTION
# ============================================================

# A line containing any of these (case-insensitive substring) is treated as
# a header / journal metadata line and can never be the title.
_TITLE_REJECT_WORDS = (
    "abstract", "keywords", "index terms", "references",
    "received", "accepted", "date of publication",
    "date of current version", "digital object identifier",
    "doi", "volume", "issue", "@", "ieee", "transactions",
    "journal", "proceedings", "conference", "vol.", "no.",
    "pp.", "issn", "copyright",
)

_INSTITUTION_WORD_RE = re.compile(
    r"\b(university|faculty|department|school|college|institute)\b"
)


def extract_title(cleaned_text):
    """Pick the most title-like line among the first 120 non-empty lines.

    Lines are scored on word count (4-25), length (<= 180), number of
    capital letters (capped at 5) and absence of a 4-digit number; the
    first line with the highest score wins.

    Args:
        cleaned_text: Paper text (``None`` is treated as empty).

    Returns:
        The title line; ``"untitled paper"`` for empty input; or an error
        placeholder when more than 20% of the chosen line consists of
        characters outside ``[a-zA-Z0-9\\s:,.-]``.
    """
    lines = [
        l.strip()
        for l in (cleaned_text or "").split("\n")
        if l.strip()
    ]

    if not lines:
        return "untitled paper"

    best_title = ""
    best_score = 0

    for line in lines[:120]:
        low = line.lower()

        # aggressive reject for headers and journal metadata
        if any(word in low for word in _TITLE_REJECT_WORDS):
            continue
        if _INSTITUTION_WORD_RE.search(low):
            continue

        score = 0

        # titles are usually between 4 and 25 words
        if 4 <= len(line.split()) <= 25:
            score += 5
        if len(line) <= 180:
            score += 2

        # title usually contains capitals
        score += min(sum(1 for c in line if c.isupper()), 5)

        # headers often carry a year such as 2025; titles usually do not
        if not re.search(r"\b\d{4}\b", line):
            score += 3

        if score > best_score:
            best_score = score
            best_title = line

    final_title = best_title if best_title else lines[0]

    # check for weird encoding gibberish
    weird_chars = len(re.findall(r"[^a-zA-Z0-9\s:,\.\-]", final_title))
    if weird_chars > len(final_title) * 0.2:
        return "title extraction error (please enter manually)"

    return final_title


# ============================================================
# 7) ABSTRACT EXTRACTION (IEEE ROBUST VERSION)
# ============================================================
def extract_abstract(cleaned_text):
    """Return the abstract as a single whitespace-normalised paragraph.

    The abstract starts after the first "Abstract" marker (plain or
    letter-spaced) and stops at the first Keywords / Index Terms /
    Introduction marker. Without an "Abstract" marker, the last paragraph
    of more than 40 words before "Introduction" is used instead.

    Returns:
        At most 2500 characters, or ``""`` when nothing is found.
    """
    if not cleaned_text:
        return ""

    text = safe_str(cleaned_text)

    # catch standard, capitalized, and spaced-out versions
    start_patterns = [
        r"\bA\s*B\s*S\s*T\s*R\s*A\s*C\s*T\b\s*[—\-:\.]?\s*",
        r"\bAbstract\b\s*[—\-:\.]?\s*",
    ]

    start_pos = -1
    for p in start_patterns:
        m = re.search(p, text, flags=re.IGNORECASE)
        if m:
            start_pos = m.end()
            break

    # fallback: no "abstract" keyword, grab the block before the introduction
    if start_pos == -1:
        intro_match = re.search(
            r"\b(1\.|I\.)?\s*INTRODUCTION\b", text, flags=re.IGNORECASE
        )
        if intro_match:
            paragraphs = text[:intro_match.start()].split("\n\n")
            # last chunky paragraph before the introduction
            for para in reversed(paragraphs):
                if len(para.split()) > 40:
                    return re.sub(r"\s+", " ", para).strip()
        return ""

    tail = text[start_pos:]

    # strict stop markers so it doesn't bleed into the main body
    stop_markers = [
        r"\bKeywords\b",
        r"\bIndex Terms\b",
        r"\bI\.\s*INTRODUCTION\b",
        r"\b1\.\s*INTRODUCTION\b",
        r"\n\s*INTRODUCTION\b",
    ]

    stop_pos = len(tail)
    for marker in stop_markers:
        mm = re.search(marker, tail, flags=re.IGNORECASE)
        if mm:
            stop_pos = min(stop_pos, mm.start())

    abstract = re.sub(r"\s+", " ", tail[:stop_pos]).strip()

    # remove leftover metadata noise
    abstract = re.sub(
        r"The associate editor.*?publication.*?\.",
        "",
        abstract,
        flags=re.IGNORECASE,
    )

    return abstract[:2500].strip()


# ============================================================
# 8) AUTHORS + AFFILIATION EXTRACTION (IEEE + GENERAL HEURISTIC)
# ============================================================

# "lab" is matched as a whole word only: as a bare substring it also hits
# ordinary words such as "scalable", "available" or "label", which made
# title lines look like affiliations.
_AFFILIATION_RE = re.compile(
    r"university|faculty|department|school|college|institute|"
    r"research center|centre|laboratory|malaysia|campus|\blabs?\b"
)

_AUTHOR_REJECT_KEYWORDS = (
    "abstract", "keywords", "index terms", "received", "accepted",
    "date of publication", "date of current version",
    "digital object identifier", "doi", "copyright", "volume",
    "issue", "ieee",
)


def _strip_symbols(text, allowed):
    """Keep letters (any script), ASCII digits, whitespace and ``allowed``."""
    return "".join(
        ch for ch in text
        if ch.isalpha() or ch in "0123456789" or ch.isspace() or ch in allowed
    ).strip()


def _count_capitalised_words(line):
    """Count alphabetic words shaped like a name: one capital + lowercase."""
    return sum(
        1
        for token in re.findall(r"[^\W\d_]+", line)
        if len(token) >= 2 and token[0].isupper() and token[1:].islower()
    )


def extract_authors_affiliation(cleaned_text, paper_title=""):
    """Heuristically find the author line and the affiliation line.

    Only the first 120 non-empty lines are inspected. The affiliation is the
    first line shorter than 250 characters containing an institution
    keyword. The author line is the first line before "abstract" that is
    not the title, metadata, an e-mail line, an affiliation or a numbered
    heading, and that scores at least 6 on capitalised words, initials,
    commas and word count. Letters with diacritics are kept.

    Args:
        cleaned_text: Paper text (``None`` is treated as empty).
        paper_title: Title to skip when scanning for the author line.

    Returns:
        ``(authors, affiliation)``; each falls back to a "not found"
        placeholder string.
    """
    lines = [
        l.strip()
        for l in (cleaned_text or "").split("\n")
        if l.strip()
    ]

    if not lines:
        return "author information not found", "affiliation data not found"

    authors = ""
    affiliation = ""
    head_lines = lines[:120]
    title_low = (paper_title or "").strip().lower()

    # ==========================
    # AFFILIATION
    # ==========================
    for line in head_lines:
        if _AFFILIATION_RE.search(line.lower()) and len(line) < 250:
            affiliation = line
            break

    # ==========================
    # AUTHORS
    # ==========================
    for line in head_lines:
        low = line.lower()

        # ignore the line if it is exactly the paper title
        if title_low and low == title_low:
            continue

        # authors never appear after the abstract starts
        if "abstract" in low:
            break

        if any(k in low for k in _AUTHOR_REJECT_KEYWORDS):
            continue
        if "@" in line:
            continue
        if _AFFILIATION_RE.search(low):
            continue

        # skip section heading
        if re.match(r"^[IVX]{1,6}\.", line) or re.match(r"^\d+\.", line):
            continue

        # author line usually contains names
        initials = len(re.findall(r"\b[A-Z]\.", line))
        score = _count_capitalised_words(line) + (initials * 2)

        if "," in line:
            score += 3
        if 2 <= len(line.split()) <= 20:
            score += 2

        if score >= 6:
            authors = line
            break

    # drop footnote markers and symbols, keep accented letters
    authors = _strip_symbols(authors, ",.-")
    affiliation = _strip_symbols(affiliation, ",.-()")

    if not authors or len(authors) < 3:
        authors = "author information not found"
    if not affiliation or len(affiliation) < 3:
        affiliation = "affiliation data not found"

    return authors, affiliation


# ============================================================
# 9) KEYWORDS EXTRACTION (ROBUST IEEE VERSION)
# ============================================================
def extract_keywords(cleaned_text):
    """Return up to 12 de-duplicated keywords from the keyword block.

    The block starts at the first "Keywords" marker (or, if there is none,
    the first "Index Terms" marker), spans at most 1200 characters and ends
    at the first section/metadata marker. It is split on commas and
    semicolons; entries shorter than 3 or longer than 60 characters are
    dropped.
    """
    text = safe_str(cleaned_text)
    if not text:
        return []

    # The separator after the label is optional, so these also cover the
    # dash / em-dash forms ("Keywords—", "Index Terms –").
    label_patterns = [
        r"\bKeywords\s*[:\-]?\s*(.+)",
        r"\bIndex Terms\s*[:\-]?\s*(.+)",
    ]
    stop_patterns = [
        r"\bI\.\s*INTRODUCTION\b",
        r"\b1\.\s*INTRODUCTION\b",
        r"\bINTRODUCTION\b",
        r"\bABSTRACT\b",
        r"\bREFERENCES\b",
        r"\bReceived\b",
        r"\bAccepted\b",
        r"\bDigital Object Identifier\b",
    ]

    kw_block = ""
    for pattern in label_patterns:
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if not m:
            continue

        tail = text[m.start():m.start() + 1200]

        stop_pos = len(tail)
        for sp in stop_patterns:
            mm = re.search(sp, tail, flags=re.IGNORECASE)
            if mm:
                stop_pos = min(stop_pos, mm.start())

        # drop the label itself
        kw_block = re.sub(
            r"^(Keywords|Index Terms)\s*[:\-–—]?\s*",
            "",
            tail[:stop_pos],
            flags=re.IGNORECASE,
        )
        break

    if not kw_block:
        return []

    kw_block = re.sub(r"\s+", " ", kw_block.replace("\n", " "))

    keywords = []
    for kw in re.split(r",|;", kw_block):
        # keep letters of any script, digits, spaces, hyphens and parentheses
        kw = _strip_symbols(clean_text(kw), "-()")
        if 3 <= len(kw) <= 60:
            keywords.append(kw)

    # de-duplicate, preserving first-seen order
    return list(dict.fromkeys(keywords))[:12]


# ============================================================
# 10) REFERENCES EXTRACTION
# ============================================================

# Headings that end the reference list. Covers ACKNOWLEDGMENT(S) and
# ACKNOWLEDGEMENT(S), plus singular and plural biography headings.
_REFERENCES_STOP_RE = re.compile(
    r"^\s*(?:APPENDIX|ACKNOWLEDGE?MENTS?|AUTHOR BIOGRAPH(?:Y|IES)|"
    r"BIOGRAPH(?:Y|IES)|ABOUT THE AUTHORS)\b",
    re.IGNORECASE | re.MULTILINE,
)


def extract_references(cleaned_text):
    """Return the text of the REFERENCES section (at most 15000 chars).

    The section starts after the first line beginning with "REFERENCES"
    (case-insensitive) and ends at the first appendix / acknowledgement /
    biography heading, or at the end of the text.

    Returns:
        The reference text, or ``""`` when no REFERENCES heading exists.
    """
    cleaned_text = cleaned_text or ""

    ref_match = re.search(
        r"^\s*REFERENCES\b",
        cleaned_text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    if not ref_match:
        return ""

    references_text = cleaned_text[ref_match.end():].strip()

    # ----------------------------------------
    # STOP AFTER REFERENCES SECTION
    # ----------------------------------------
    stop = _REFERENCES_STOP_RE.search(references_text)
    if stop:
        references_text = references_text[:stop.start()]

    # ----------------------------------------
    # CLEAN
    # ----------------------------------------
    references_text = remove_ieee_noise(references_text)
    references_text = re.sub(r"\n{4,}", "\n\n", references_text).strip()

    # ----------------------------------------
    # LIMIT SIZE
    # ----------------------------------------
    return references_text[:15000]


# ============================================================
# 11) REMOVE KEYWORDS + REFERENCES FROM MAIN BODY
# ============================================================

# MULTILINE lets "^" anchor at the keyword line wherever it sits in the
# text. The span is capped so a keyword line without a following blank
# line cannot swallow the body.
_KEYWORD_BLOCK_RE = re.compile(
    r"^[ \t]*(?:Keywords|Index Terms)\b.{0,1200}?\n\s*\n",
    re.IGNORECASE | re.DOTALL | re.MULTILINE,
)


def remove_keywords_and_references(cleaned_text):
    """Return the body text without the keyword block and the references.

    The first keyword block (from a line starting with "Keywords" or
    "Index Terms" up to the next blank line) is removed, everything from
    the first REFERENCES heading onward is cut, and runs of three or more
    newlines are collapsed to one blank line.

    Args:
        cleaned_text: Paper text (``None`` is treated as empty).
    """
    text = cleaned_text or ""

    # remove keyword block (multi-line safe); only the first occurrence
    text = _KEYWORD_BLOCK_RE.sub("\n\n", text, count=1)

    # cut before REFERENCES
    text = re.split(
        r"^\s*REFERENCES\b",
        text,
        maxsplit=1,
        flags=re.IGNORECASE | re.MULTILINE,
    )[0]

    # remove extra blank lines
    return re.sub(r"\n{3,}", "\n\n", text).strip()


# ============================================================
# 12) DETECT SECTION HEADINGS
# IEEE + GENERAL JOURNAL SUPPORT
# ============================================================
_HEADING_PATTERNS = (
    # III. RESULTS
    re.compile(r"^\s*([IVX]{1,8})\.\s+(.+?)\s*$", re.MULTILINE),
    # 3 RESULTS
    re.compile(r"^\s*(\d{1,2})\.?\s+([A-Za-z].+?)\s*$", re.MULTILINE),
    # A. Experimental Results
    re.compile(r"^\s*([A-Z])\.\s+(.+?)\s*$", re.MULTILINE),
    # RESULTS AND DISCUSSION
    re.compile(r"^\s*([A-Z][A-Z0-9 \-\(\)/]{4,})\s*$", re.MULTILINE),
)

# Lower-cased title prefixes / fragments that mark captions and metadata.
_HEADING_REJECT_PREFIXES = (
    "table", "fig", "figure", "volume", "received", "accepted",
)
_HEADING_REJECT_FRAGMENTS = (
    "copyright", "creative commons", "digital object identifier",
)


def detect_section_headings(text):
    """Find candidate section headings.

    Args:
        text: Body text to scan (converted with ``safe_str``).

    Returns:
        A list of dicts sorted by position, each with ``label`` (numbering
        token or ``""``), ``title``, ``start`` (offset where the heading
        match begins, leading whitespace included) and ``body_start``
        (offset just past the heading match). Repeated titles
        (case-insensitive) keep only their first occurrence.
    """
    text = safe_str(text)

    headings = []
    seen_positions = set()

    for pat in _HEADING_PATTERNS:
        for m in pat.finditer(text):
            start = m.start()

            # an earlier pattern already claimed this position
            if start in seen_positions:
                continue
            seen_positions.add(start)

            if m.lastindex >= 2:
                label = m.group(1).strip()
                title = m.group(2).strip()
            else:
                label = ""
                title = m.group(1).strip()

            title = re.sub(r"\s{2,}", " ", title).strip()
            low = title.lower()

            # ------------------------------------------------
            # FILTER GARBAGE
            # ------------------------------------------------
            if not 4 <= len(title) <= 120:
                continue
            if low.startswith(_HEADING_REJECT_PREFIXES):
                continue
            if any(fragment in low for fragment in _HEADING_REJECT_FRAGMENTS):
                continue
            # running header of the form "w. <name> ..."
            if re.match(r"^w\.\s*[a-z]", low):
                continue

            headings.append({
                "label": label,
                "title": title,
                "start": start,
                "body_start": m.end(),
            })

    # --------------------------------------------------------
    # SORT
    # --------------------------------------------------------
    headings.sort(key=lambda h: h["start"])

    # --------------------------------------------------------
    # REMOVE DUPLICATES
    # --------------------------------------------------------
    cleaned = []
    used_titles = set()
    for h in headings:
        title_key = h["title"].lower().strip()
        if title_key in used_titles:
            continue
        used_titles.add(title_key)
        cleaned.append(h)

    return cleaned


# ============================================================
# 13) MAP HEADING INTO IMRAD CATEGORY
# ============================================================

# Checked in this order; the first category with a keyword contained in the
# lower-cased heading wins.
_IMRAD_HEADING_KEYWORDS = (
    ("introduction", (
        "introduction", "background", "motivation", "overview",
        "preliminaries", "related work", "literature review",
        "state of the art", "problem statement", "research gap",
    )),
    ("methodology", (
        "method", "methodology", "materials", "implementation",
        "framework", "architecture", "design", "approach",
        "system model", "proposed system", "proposed method",
        "proposed framework", "procedure", "development",
        "algorithm", "workflow",
        # IEEE common
        "dataset", "data collection", "data preprocessing", "training",
        "testing setup", "experimental setup", "feature extraction",
        "model construction", "network structure", "network model",
        "model architecture", "fasternet", "yolov5", "cnn", "resnet",
        "classifier",
    )),
    ("results", (
        "results", "evaluation", "experiment", "analysis",
        "performance", "validation", "discussion", "findings",
        "testing", "comparison",
        # IEEE common
        "experimental results", "performance comparison",
        "ablation study", "benchmark", "case study",
        "accuracy analysis", "result analysis",
    )),
    ("conclusion", (
        "conclusion", "conclusions", "future work", "future research",
        "summary", "limitations", "recommendation", "recommendations",
        "closing remarks",
    )),
)


def map_heading_to_imrad(heading_title):
    """Map a heading title to an IMRAD key, or ``"other"``.

    Returns one of ``"introduction"``, ``"methodology"``, ``"results"``,
    ``"conclusion"`` or ``"other"``.
    """
    t = safe_str(heading_title).lower().strip()

    for category, keywords in _IMRAD_HEADING_KEYWORDS:
        if any(k in t for k in keywords):
            return category

    return "other"


# ============================================================
# 14) FALLBACK SPLIT BY KEYWORDS (IMPROVED)
# ============================================================
def fallback_split_by_keywords(text):
    """Split text into IMRAD sections from the first keyword occurrences.

    Each section starts at the earliest occurrence (case-insensitive
    substring) of any of its keywords and runs to the start of the next
    located section. When no keyword is found at all, the text is cut at
    30% / 60% / 85%. An empty introduction falls back to the first 1500
    characters and an empty conclusion to the last 1500.

    Returns:
        A dict with keys ``introduction``, ``methodology``, ``results``
        and ``conclusion``.
    """
    text = text or ""
    text_lower = text.lower()

    def find_pos(keyword_list):
        """Earliest position of any keyword, or -1."""
        hits = [
            pos
            for pos in (text_lower.find(kw) for kw in keyword_list)
            if pos != -1
        ]
        return min(hits) if hits else -1

    intro_pos = find_pos([
        "introduction", "background", "motivation", "overview",
    ])
    method_pos = find_pos([
        "methodology", "methods", "materials and methods",
        "proposed method", "proposed framework", "proposed system",
        "system design", "framework", "architecture",
        "implementation", "algorithm", "workflow",
    ])
    results_pos = find_pos([
        "results", "experimental results", "evaluation",
        "performance evaluation", "experiment", "experiments",
        "analysis", "discussion", "findings", "testing",
    ])
    concl_pos = find_pos([
        "conclusion", "conclusions", "future work", "summary",
        "concluding remarks", "final remarks", "limitations",
    ])

    positions = sorted(
        (
            (name, pos)
            for name, pos in (
                ("introduction", intro_pos),
                ("methodology", method_pos),
                ("results", results_pos),
                ("conclusion", concl_pos),
            )
            if pos != -1
        ),
        key=lambda item: item[1],
    )

    # --------------------------------------------------------
    # No headings detected
    # --------------------------------------------------------
    if not positions:
        n = len(text)
        return {
            "introduction": text[:int(n * 0.30)].strip(),
            "methodology": text[int(n * 0.30):int(n * 0.60)].strip(),
            "results": text[int(n * 0.60):int(n * 0.85)].strip(),
            "conclusion": text[int(n * 0.85):].strip(),
        }

    imrad = {
        "introduction": "",
        "methodology": "",
        "results": "",
        "conclusion": "",
    }

    for i, (name, start) in enumerate(positions):
        end = positions[i + 1][1] if i < len(positions) - 1 else len(text)
        imrad[name] = text[start:end].strip()

    # --------------------------------------------------------
    # Safety fallback
    # --------------------------------------------------------
    if not imrad["introduction"]:
        imrad["introduction"] = text[:1500]
    if not imrad["conclusion"]:
        imrad["conclusion"] = text[-1500:]

    return imrad


# ============================================================
# 15) SPLIT IMRAD USING HEADINGS
# ============================================================
def _balanced_cut(text):
    """Offset near the middle of ``text`` that avoids cutting a word.

    Looks for a paragraph break, then a line break, then a space within
    the quarter of the text after the midpoint; falls back to the exact
    midpoint.
    """
    half = len(text) // 2
    limit = half + len(text) // 4
    for separator in ("\n\n", "\n", " "):
        pos = text.find(separator, half, limit)
        if pos != -1:
            return pos
    return half


def split_into_imrad_sections(clean_body_text):
    """Assign the body text to IMRAD sections using detected headings.

    Each heading's text runs from the end of the heading to the start of
    the next heading, and is appended to the category its title maps to.
    Text under unmapped headings is used to fill a missing methodology,
    results or conclusion when it contains matching cue words. When three
    or more sections stay empty, or no heading is found, the keyword
    fallback is used instead.

    Returns:
        A dict with keys ``introduction``, ``methodology``, ``results``
        and ``conclusion``.
    """
    clean_body_text = safe_str(clean_body_text)

    headings = detect_section_headings(clean_body_text)
    if not headings:
        return fallback_split_by_keywords(clean_body_text)

    imrad = {
        "introduction": "",
        "methodology": "",
        "results": "",
        "conclusion": "",
    }
    other_chunks = []

    for idx, h in enumerate(headings):
        end = (
            headings[idx + 1]["start"]
            if idx + 1 < len(headings)
            else len(clean_body_text)
        )
        # Slice from the end of the heading so the heading text itself is
        # dropped whether or not it carries a numbering label.
        body_start = h.get("body_start", h["start"])
        chunk = clean_body_text[body_start:end].strip()

        category = map_heading_to_imrad(safe_str(h.get("title", "")))
        if category in imrad:
            if imrad[category]:
                imrad[category] += "\n\n"
            imrad[category] += chunk
        else:
            other_chunks.append(chunk)

    # clean up extra blank lines
    for key in imrad:
        imrad[key] = re.sub(r"\n{3,}", "\n\n", imrad[key]).strip()

    # content recovery for missing sections
    if not imrad["methodology"]:
        for chunk in other_chunks:
            if any(k in chunk.lower() for k in [
                "proposed method", "framework", "architecture",
                "dataset", "training",
            ]):
                imrad["methodology"] = chunk
                break

    if not imrad["results"]:
        for chunk in other_chunks:
            if any(k in chunk.lower() for k in [
                "accuracy", "experiment", "evaluation", "performance",
            ]):
                imrad["results"] = chunk
                break

    if not imrad["conclusion"]:
        for chunk in reversed(other_chunks):
            if any(k in chunk.lower() for k in [
                "conclusion", "future work", "summary", "limitation",
            ]):
                imrad["conclusion"] = chunk
                break

    # hard fallback if the mapping completely failed
    empty_count = sum(1 for v in imrad.values() if not v.strip())
    if empty_count >= 3:
        return fallback_split_by_keywords(clean_body_text)

    # "giant introduction" repair: a very long introduction next to a tiny
    # methodology is split in two. Existing methodology text is kept
    # (appended after the moved half) rather than overwritten.
    intro = imrad["introduction"]
    if len(intro) > 6000 and len(imrad["methodology"]) < 500:
        cut = _balanced_cut(intro)
        moved = intro[cut:].strip()
        existing = imrad["methodology"]
        imrad["methodology"] = f"{moved}\n\n{existing}" if existing else moved
        imrad["introduction"] = intro[:cut].strip()

    return imrad


# ============================================================
# 16) MAIN PIPELINE FUNCTION (PHASE 1)
# ============================================================
def run_phase1_pipeline(pdf_path):
    """Run the complete Phase 1 preprocessing on one PDF.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A dict with ``paper_id``, ``title``, ``doi``, ``year``,
        ``keywords``, ``abstract``, ``cleaned_text`` (body without keywords
        and references), ``imrad_sections``, ``references``, ``metadata``
        and ``status``.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    raw_text = extract_text_from_pdf(pdf_path)

    if not raw_text.strip():
        raise ValueError(
            "❌ Extracted PDF text is empty (This PDF may be scanned, OCR is required)."
        )

    cleaned_text = clean_extracted_text(raw_text)
    paper_id = os.path.splitext(os.path.basename(pdf_path))[0]

    title = extract_title(cleaned_text)
    doi = extract_doi(cleaned_text)
    # The year is read from the RAW text: cleaning strips the
    # "received / accepted / current version D Month YYYY" lines that the
    # highest-priority year patterns look for.
    year = extract_year(raw_text)
    abstract = extract_abstract(cleaned_text)
    keywords = extract_keywords(cleaned_text)
    references = extract_references(cleaned_text)
    # Pass the title so the title line is never mistaken for the authors.
    authors, affiliation = extract_authors_affiliation(
        cleaned_text, paper_title=title
    )

    # remove keywords + references before IMRAD split
    clean_body_text = remove_keywords_and_references(cleaned_text)

    # split IMRAD
    imrad_sections = split_into_imrad_sections(clean_body_text)
    imrad_sections = {
        "introduction": imrad_sections.get("introduction", "").strip(),
        "methodology": imrad_sections.get("methodology", "").strip(),
        "results": imrad_sections.get("results", "").strip(),
        "conclusion": imrad_sections.get("conclusion", "").strip(),
    }

    metadata = {
        "paper_id": paper_id,
        "source_file": os.path.basename(pdf_path),
        "processed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "title": title,
        "authors": authors,
        "affiliation": affiliation,
        "doi": doi,
        "year": year,
        "raw_text_length": len(raw_text),
        "cleaned_text_length": len(cleaned_text),
        "body_length": len(clean_body_text),
        "imrad_detected": {
            "intro_len": len(imrad_sections["introduction"]),
            "method_len": len(imrad_sections["methodology"]),
            "results_len": len(imrad_sections["results"]),
            "conclusion_len": len(imrad_sections["conclusion"]),
        },
        "notes": {
            "preserve_tables": True,
            "preserve_linebreaks": True,
            "preserve_numbering": True,
            "abstract_detected": bool(abstract),
            "doi_detected": bool(doi),
        },
    }

    return {
        "paper_id": paper_id,
        "title": title,
        "doi": doi,
        "year": year,
        "keywords": keywords,
        "abstract": abstract,
        "cleaned_text": clean_body_text,
        "imrad_sections": imrad_sections,
        "references": references,
        "metadata": metadata,
        "status": "success",
    }


# ============================================================
# QUICK TEST
# ============================================================
if __name__ == "__main__":
    test_pdf = "sample.pdf"

    if os.path.exists(test_pdf):
        out = run_phase1_pipeline(test_pdf)

        print("\n========== PHASE 1 OUTPUT TEST ==========")
        print("PAPER ID:", out["paper_id"])
        print("TITLE:", out["title"])
        print("AUTHORS:", out["metadata"]["authors"])
        print("AFFILIATION:", out["metadata"]["affiliation"])
        print("DOI:", out["doi"])
        print("YEAR:", out["year"])
        print("KEYWORDS:", out["keywords"])
        print("ABSTRACT LEN:", len(out["abstract"]))
        print("INTRO LEN:", len(out["imrad_sections"]["introduction"]))
        print("METHOD LEN:", len(out["imrad_sections"]["methodology"]))
        print("RESULTS LEN:", len(out["imrad_sections"]["results"]))
        print("CONCLUSION LEN:", len(out["imrad_sections"]["conclusion"]))
        print("REFERENCES LEN:", len(out["references"]))
        print("========================================\n")
    else:
        print("❌ sample.pdf not found for testing.")