Spaces:

mrciomnl
/

Docubot-PDF_Analyzer

Sleeping

App Files Files Community

Docubot-PDF_Analyzer / analyzer.py

mrciomnl

New Branch

929fd26 9 months ago

raw

history blame contribute delete

2.95 kB

	import pdfplumber
	import spacy
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sumy.parsers.plaintext import PlaintextParser
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.summarizers.text_rank import TextRankSummarizer
	import re
	import numpy as np

	# Load the spaCy "en_core_web_trf" pipeline once, at import time. The resulting
	# module-level `nlp` object is reused by perform_ner() and get_relevant_chunks().
	nlp = spacy.load("en_core_web_trf")

	# === Text Cleaning ===
	def clean_text(text):
	    """Normalize raw extracted text into a single ASCII-only line.
	    Bullet characters are removed outright, every run of non-ASCII
	    characters becomes one space, all whitespace runs collapse to one
	    space, and leading/trailing whitespace is trimmed.
	    Args:
	        text: The string to clean.
	    Returns:
	        The cleaned string.
	    """
	    # Applied in order: the bullet must go before the generic non-ASCII
	    # rule, which would otherwise turn it into a space.
	    substitutions = (
	        (r"•", ""),
	        (r"[^\x00-\x7F]+", " "),
	        (r"\s+", " "),
	    )
	    cleaned = text
	    for pattern, replacement in substitutions:
	        cleaned = re.sub(pattern, replacement, cleaned)
	    return cleaned.strip()

	# === PDF Extraction ===
	def extract_text_from_pdf(file):
	    """Extract the text of every page of a PDF and return it cleaned.
	    Args:
	        file: Whatever ``pdfplumber.open`` accepts for the PDF to read.
	    Returns:
	        The text of all pages that yielded any, joined by newlines and
	        passed through ``clean_text``. Empty string if no page had text.
	    """
	    with pdfplumber.open(file) as pdf:
	        # Extract while the PDF is still open; collect the pieces and join
	        # once instead of growing a string with += on every page.
	        page_texts = [page.extract_text() for page in pdf.pages]
	    # Pages with no extractable text (None or "") are skipped.
	    return clean_text("\n".join(chunk for chunk in page_texts if chunk))

	# === Named Entity Recognition ===
	def perform_ner(text):
	    """Run the module-level spaCy pipeline and group selected entities.
	    Args:
	        text: The text to analyze.
	    Returns:
	        A dict with keys ``"people"`` (PERSON entities), ``"places"``
	        (GPE and LOC entities) and ``"organizations"`` (ORG entities).
	        Each value is a list of entity strings in document order;
	        duplicates are kept.
	    """
	    grouped = {"people": [], "places": [], "organizations": []}
	    bucket_for_label = {
	        "PERSON": "people",
	        "GPE": "places",
	        "LOC": "places",
	        "ORG": "organizations",
	    }
	    for entity in nlp(text).ents:
	        bucket = bucket_for_label.get(entity.label_)
	        if bucket is not None:
	            grouped[bucket].append(entity.text)
	    return grouped

	# === TF-IDF Relevance ===
	def get_relevant_chunks(query, text, num_chunks=5):
	    """Return the sentences of ``text`` most similar to ``query``.
	    Sentences are split with the module-level spaCy pipeline, and only
	    those longer than 10 characters after stripping are considered.
	    Sentences and query are vectorized together with TF-IDF (English stop
	    words, unigrams and bigrams) and ranked by cosine similarity.
	    Args:
	        query: The search string.
	        text: The document text to search.
	        num_chunks: Maximum number of sentences to return.
	    Returns:
	        Up to ``num_chunks`` sentences, most similar first. An empty list
	        when ``num_chunks`` is not positive, when ``text`` is empty or
	        blank, when no sentence qualifies, or when the vectorizer finds
	        no usable vocabulary.
	    """
	    # A slice of [-0:] would select every sentence and a negative count an
	    # arbitrary tail, so reject non-positive counts. Blank text is
	    # rejected here too, before the pipeline is run on it.
	    if num_chunks <= 0 or not text or not text.strip():
	        return []
	    sentences = []
	    for sent in nlp(text).sents:
	        stripped = sent.text.strip()
	        if len(stripped) > 10:
	            sentences.append(stripped)
	    if not sentences:
	        # cosine_similarity cannot compare against zero rows.
	        return []
	    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
	    try:
	        tfidf_matrix = vectorizer.fit_transform(sentences + [query])
	    except ValueError:
	        # Raised when nothing but stop words remains (empty vocabulary);
	        # there is nothing to rank in that case.
	        return []
	    # The last row is the query; every earlier row is a sentence.
	    scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
	    best_first = scores.argsort()[-num_chunks:][::-1]
	    return [sentences[i] for i in best_first]

	# === Summary Cleanup ===
	def deduplicate(sentences):
	    """Strip each sentence and drop repeats, keeping first-seen order.
	    Args:
	        sentences: An iterable of strings.
	    Returns:
	        A list of the stripped strings with later duplicates removed.
	    """
	    # dict keys are unique and remember insertion order.
	    return list(dict.fromkeys(item.strip() for item in sentences))

	def is_too_technical(s):
	    """Return True if ``s`` is over 300 characters or has more than three '=' signs."""
	    if len(s) > 300:
	        return True
	    return s.count("=") >= 4

	def is_tabular(s):
	    """Return True if ``s`` starts with a digit or contains more than six digit runs."""
	    if re.match(r'^\d', s) is not None:
	        return True
	    digit_runs = re.findall(r'\d+', s)
	    return len(digit_runs) > 6

	def shorten(s, limit=250):
	    """Truncate ``s`` to about ``limit`` characters, cutting at a space.
	    Args:
	        s: The string to shorten.
	        limit: Strings of at most this length are returned unchanged.
	    Returns:
	        ``s`` itself if it fits. Otherwise the first ``limit`` characters,
	        cut back to just before the last space in them (if any), with
	        ``"..."`` appended.
	    """
	    if len(s) <= limit:
	        return s
	    clipped = s[:limit]
	    last_space = clipped.rfind(" ")
	    if last_space != -1:
	        # Drop the trailing partial word together with the space before it.
	        clipped = clipped[:last_space]
	    return clipped + "..."

	def filter_summary(summary):
	    """De-duplicate summary sentences, drop unwanted ones, shorten the rest.
	    Args:
	        summary: An iterable of sentence strings.
	    Returns:
	        A list of sentences, in original order, that pass both the
	        ``is_too_technical`` and ``is_tabular`` checks, each passed through
	        ``shorten`` with its default limit.
	    """
	    kept = []
	    for sentence in deduplicate(summary):
	        if is_too_technical(sentence) or is_tabular(sentence):
	            continue
	        kept.append(shorten(sentence))
	    return kept

	# === TextRank Summarizer ===
	def summarize_text(text, num_sentences=10):
	    """Summarize ``text`` with sumy's TextRank and post-filter the result.
	    Args:
	        text: The text to summarize.
	        num_sentences: Sentence count requested from the summarizer.
	    Returns:
	        The list produced by ``filter_summary`` from the selected
	        sentences, so it may hold fewer than ``num_sentences`` items.
	    """
	    tokenizer = Tokenizer("english")
	    document = PlaintextParser.from_string(text, tokenizer).document
	    ranked = TextRankSummarizer()(document, num_sentences)
	    return filter_summary(list(map(str, ranked)))

	# === Top-Level Function ===
	def analyze_pdf(file, query):
	    """Run the full analysis on one PDF.
	    Args:
	        file: The PDF, passed on to ``extract_text_from_pdf``.
	        query: The search string, passed on to ``get_relevant_chunks``.
	    Returns:
	        A dict with keys ``"entities"`` (from ``perform_ner``),
	        ``"relevant_chunks"`` (from ``get_relevant_chunks``) and
	        ``"summary"`` (from ``summarize_text``).
	    """
	    text = extract_text_from_pdf(file)
	    report = {}
	    report["entities"] = perform_ner(text)
	    report["relevant_chunks"] = get_relevant_chunks(query, text)
	    report["summary"] = summarize_text(text)
	    return report