Docubot-PDF_Analyzer / analyzer.py
mrciomnl's picture
New Branch
929fd26
import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
import numpy as np
# Load the spaCy transformer pipeline ("en_core_web_trf") once, at import
# time. This module-level instance is shared by perform_ner() and
# get_relevant_chunks(), so the model is not reloaded on every call.
nlp = spacy.load("en_core_web_trf")
# === Text Cleaning ===
def clean_text(text):
    """Normalize raw extracted text into one whitespace-collapsed string.

    Bullet characters are removed outright, each run of non-ASCII
    characters is replaced by a single space, and every run of whitespace
    (newlines included) collapses to one space. The result has no leading
    or trailing whitespace.
    """
    without_bullets = re.sub(r"•", "", text)
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", without_bullets)
    collapsed = re.sub(r"\s+", " ", ascii_only)
    return collapsed.strip()
# === PDF Extraction ===
def extract_text_from_pdf(file):
    """Return the cleaned text of every page in a PDF.

    ``file`` is passed straight to ``pdfplumber.open``. Pages whose
    ``extract_text()`` result is empty or ``None`` are skipped; each
    remaining page's text is followed by a newline, and the combined string
    is normalized with ``clean_text`` before being returned.
    """
    with pdfplumber.open(file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        combined = "".join(content + "\n" for content in page_texts if content)
    return clean_text(combined)
# === Named Entity Recognition ===
def perform_ner(text):
    """Run the shared spaCy pipeline over ``text`` and group its entities.

    Returns a dict with three lists of entity strings, in document order
    and with duplicates kept:

    * ``"people"``        - entities labelled ``PERSON``
    * ``"places"``        - entities labelled ``GPE`` or ``LOC``
    * ``"organizations"`` - entities labelled ``ORG``

    Entities with any other label are ignored.
    """
    label_to_group = {
        "PERSON": "people",
        "GPE": "places",
        "LOC": "places",
        "ORG": "organizations",
    }
    grouped = {"people": [], "places": [], "organizations": []}
    for entity in nlp(text).ents:
        group = label_to_group.get(entity.label_)
        if group is not None:
            grouped[group].append(entity.text)
    return grouped
# === TF-IDF Relevance ===
def get_relevant_chunks(query, text, num_chunks=5):
    """Return the sentences of ``text`` most similar to ``query``.

    The text is split into sentences with the shared spaCy pipeline;
    sentences of 10 characters or fewer (after stripping) are dropped. The
    remaining sentences and the query are vectorized together with TF-IDF
    (English stop words removed, unigrams and bigrams), and sentences are
    ranked by cosine similarity to the query.

    Args:
        query: Search string to compare each sentence against.
        text: Document text to search.
        num_chunks: Maximum number of sentences to return.

    Returns:
        Up to ``num_chunks`` sentences, best match first. An empty list is
        returned when ``num_chunks`` is not positive, when ``text`` is
        empty or yields no usable sentence, or when nothing can be
        vectorized (for example, every term is a stop word).
    """
    # A zero count would turn the ``-num_chunks:`` slice below into ``-0:``
    # and select every sentence; a negative one would slice from the wrong
    # end. Blank text has no sentences to rank. Both are answered up front,
    # before paying for the spaCy pass.
    if num_chunks <= 0 or not text or not text.strip():
        return []

    stripped = (sent.text.strip() for sent in nlp(text).sents)
    sentences = [sentence for sentence in stripped if len(sentence) > 10]
    if not sentences:
        # With no candidate rows the similarity computation has nothing to
        # compare the query against.
        return []

    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences + [query])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty, e.g.
        # when the sentences and the query contain only stop words.
        return []

    # The last row is the query; every row before it is a sentence.
    cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
    indices = cosine_sim.argsort()[0, -num_chunks:][::-1]
    return [sentences[i] for i in indices]
# === Summary Cleanup ===
def deduplicate(sentences):
    """Strip each sentence and drop repeats, keeping first-seen order.

    Two sentences count as duplicates when they are equal after
    ``str.strip()``; the stripped form is what ends up in the result.
    """
    # dict keys are unique and preserve insertion order, so this keeps the
    # first occurrence of every stripped sentence.
    return list(dict.fromkeys(sentence.strip() for sentence in sentences))
def is_too_technical(s):
    """Return True for strings that are very long or heavy on ``=`` signs.

    A string qualifies when it is longer than 300 characters or contains
    more than three ``=`` characters.
    """
    if len(s) > 300:
        return True
    return s.count("=") > 3
def is_tabular(s):
    """Return True when a string looks like a row of numeric data.

    A string qualifies when it begins with a digit or contains more than
    six separate runs of digits.
    """
    if re.match(r'^\d', s) is not None:
        return True
    digit_runs = re.findall(r'\d+', s)
    return len(digit_runs) > 6
def shorten(s, limit=250):
    """Truncate ``s`` to roughly ``limit`` characters, ending with ``...``.

    Strings no longer than ``limit`` are returned unchanged. Otherwise the
    first ``limit`` characters are cut back to the last space they contain
    (or kept whole when they contain none) and ``"..."`` is appended, so
    the result may be up to three characters longer than ``limit``.
    """
    if len(s) <= limit:
        return s
    truncated = s[:limit]
    head = truncated.rsplit(" ", 1)[0]
    return head + "..."
def filter_summary(summary):
    """Clean up a list of summary sentences for display.

    Sentences are de-duplicated with ``deduplicate``, any that
    ``is_too_technical`` or ``is_tabular`` flags are discarded, and the
    survivors are passed through ``shorten``. Order is preserved.
    """
    kept = []
    for sentence in deduplicate(summary):
        if is_too_technical(sentence) or is_tabular(sentence):
            continue
        kept.append(shorten(sentence))
    return kept
# === TextRank Summarizer ===
def summarize_text(text, num_sentences=10):
    """Summarize ``text`` with sumy's TextRank summarizer.

    The text is parsed with an English tokenizer, the summarizer is asked
    for ``num_sentences`` sentences, and the sentences it returns are
    converted to strings and cleaned with ``filter_summary``. The result
    may therefore hold fewer than ``num_sentences`` items.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked = TextRankSummarizer()(document, num_sentences)
    return filter_summary(list(map(str, ranked)))
# === Top-Level Function ===
def analyze_pdf(file, query):
    """Run the full analysis pipeline on one PDF.

    Text is extracted once with ``extract_text_from_pdf`` and then fed to
    each analysis step. Returns a dict with:

    * ``"entities"``        - result of ``perform_ner``
    * ``"relevant_chunks"`` - result of ``get_relevant_chunks`` for ``query``
    * ``"summary"``         - result of ``summarize_text``
    """
    document_text = extract_text_from_pdf(file)
    # Dict values are evaluated top to bottom, so the steps run in the
    # order NER, relevance ranking, summarization.
    return {
        "entities": perform_ner(document_text),
        "relevant_chunks": get_relevant_chunks(query, document_text),
        "summary": summarize_text(document_text),
    }