import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
import numpy as np

# Load spaCy transformer model once at import time (expensive; shared by all calls).
nlp = spacy.load("en_core_web_trf")


# === Text Cleaning ===
def clean_text(text):
    """Normalize raw PDF text.

    Removes bullet glyphs, replaces non-ASCII runs with a space, and
    collapses all whitespace to single spaces.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        A stripped, single-spaced ASCII-only string.
    """
    text = re.sub(r"•", "", text)
    # Any remaining non-ASCII run becomes one space (ligatures, smart quotes, etc.).
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# === PDF Extraction ===
def extract_text_from_pdf(file):
    """Extract and clean the text of every page of a PDF.

    Args:
        file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The cleaned concatenation of all pages' text (empty string if no
        page yields text).
    """
    pages = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            # extract_text() returns None for image-only pages; skip those.
            if extracted:
                pages.append(extracted)
    # join instead of repeated += avoids quadratic string building.
    return clean_text("\n".join(pages) + ("\n" if pages else ""))


# === Named Entity Recognition ===
def perform_ner(text):
    """Run spaCy NER and bucket entities of interest.

    Args:
        text: Plain text to analyze.

    Returns:
        Dict with keys ``people`` (PERSON), ``places`` (GPE/LOC) and
        ``organizations`` (ORG); values are lists of entity strings,
        duplicates preserved in document order.
    """
    doc = nlp(text)
    return {
        "people": [ent.text for ent in doc.ents if ent.label_ == "PERSON"],
        "places": [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}],
        "organizations": [ent.text for ent in doc.ents if ent.label_ == "ORG"],
    }


# === TF-IDF Relevance ===
def get_relevant_chunks(query, text, num_chunks=5):
    """Return the sentences of *text* most similar to *query*.

    Sentences are scored by cosine similarity of TF-IDF vectors
    (uni- and bi-grams, English stop words removed).

    Args:
        query: Free-text query.
        text: Document text to search.
        num_chunks: Maximum number of sentences to return.

    Returns:
        Up to ``num_chunks`` sentences, most similar first. Empty list when
        the document yields no usable sentences or the vocabulary is empty
        (previously this raised ``ValueError`` from TfidfVectorizer).
    """
    sentences = [
        sent.text.strip()
        for sent in nlp(text).sents
        if len(sent.text.strip()) > 10
    ]
    if not sentences:
        # Guard: fit_transform on an empty corpus raises ValueError.
        return []
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences + [query])
    except ValueError:
        # "empty vocabulary" — every token was a stop word; nothing to rank.
        return []
    # Last row is the query; compare it against every sentence row.
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    # Top-scoring indices, highest similarity first (slice tolerates
    # num_chunks > len(sentences)).
    indices = cosine_sim.argsort()[0, -num_chunks:][::-1]
    return [sentences[i] for i in indices]


# === Summary Cleanup ===
def deduplicate(sentences):
    """Return *sentences* stripped and with exact duplicates removed, order kept."""
    seen = set()
    result = []
    for s in sentences:
        s = s.strip()
        if s not in seen:
            seen.add(s)
            result.append(s)
    return result


def is_too_technical(s):
    """Heuristic: many '=' signs or very long sentences read as code/config dumps."""
    return s.count("=") > 3 or len(s) > 300


def is_tabular(s):
    """Heuristic: leading digit or more than six numbers suggests table content."""
    return bool(re.match(r'^\d', s)) or len(re.findall(r'\d+', s)) > 6


def shorten(s, limit=250):
    """Truncate *s* to at most *limit* chars at a word boundary, adding '...'."""
    return s if len(s) <= limit else s[:limit].rsplit(" ", 1)[0] + "..."
def filter_summary(summary):
    """Clean a list of summary sentences.

    Drops exact duplicates, skips sentences that look technical or tabular,
    and truncates the survivors to a readable length.
    """
    kept = []
    for sentence in deduplicate(summary):
        if is_too_technical(sentence) or is_tabular(sentence):
            continue
        kept.append(shorten(sentence))
    return kept


# === TextRank Summarizer ===
def summarize_text(text, num_sentences=10):
    """Produce a cleaned extractive summary of *text* using TextRank.

    Args:
        text: Document text to summarize.
        num_sentences: Number of sentences TextRank should select
            (filtering may return fewer).

    Returns:
        List of cleaned summary sentences.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked = TextRankSummarizer()(document, num_sentences)
    return filter_summary([str(sentence) for sentence in ranked])


# === Top-Level Function ===
def analyze_pdf(file, query):
    """Run the full pipeline on one PDF: extract, NER, relevance, summary.

    Args:
        file: Path or file-like object for the PDF.
        query: Query string used to rank relevant sentences.

    Returns:
        Dict with keys ``entities``, ``relevant_chunks`` and ``summary``.
    """
    text = extract_text_from_pdf(file)
    return {
        "entities": perform_ner(text),
        "relevant_chunks": get_relevant_chunks(query, text),
        "summary": summarize_text(text),
    }