Spaces: Sleeping
import spacy
from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Application singletons: the Flask app and the German spaCy pipeline.
app = Flask(__name__)
# NOTE(review): requires the small German model to be installed
# (python -m spacy download de_core_news_sm) — confirm in deployment image.
nlp = spacy.load("de_core_news_sm")
def index():
    """Render a minimal HTML form that POSTs a text field to /reverse.

    NOTE(review): no @app.route decorator is visible on this view — it may
    have been lost when this file was scraped; confirm route registration.
    """
    form_html = '''
<form action="/reverse" method="post">
Enter text: <input name="text" />
<input type="submit" />
</form>
'''
    return form_html
def reverse(text=None):
    """Return an HTML snippet showing *text* reversed, with a link to the form.

    Args:
        text: String to reverse. When None (the default, as used when invoked
            as a Flask view), the 'text' field is read from the POSTed form,
            falling back to '' if absent. The parameter makes the function
            callable — and testable — outside a request context.

    Returns:
        An HTML string with the reversed text and a 'Try again' link.
    """
    if text is None:
        text = request.form.get('text', '')
    return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50, text=None):
    """Run spaCy NLP analysis on text and split it into LLM-sized chunks.

    Args:
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Characters of overlap between consecutive chunks.
        text: Text to process. When None (the default, as used when invoked
            as a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response with:
            'chunks': list of text chunks from the recursive splitter;
            'preprocessed_data': tokens/lemmas (with POS, dependency, and
                stop-word flags), named entities (with offsets and label
                descriptions), noun chunks, and the raw text.
    """
    if text is None:
        text = request.form.get('text', '')
    doc = nlp(text)

    # Token-level features; punctuation tokens are dropped.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,  # dependency relation label
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entities with character offsets and human-readable label descriptions.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),  # explanation of entity type
        }
        for ent in doc.ents
    ]

    # Base noun phrases with their syntactic roots.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    preprocessed_data = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }

    # Split while preserving page and line markers. The trailing "" separator
    # restores the splitter's character-level fallback, guaranteeing no chunk
    # exceeds max_chunk_size even for long unbroken runs of text.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " ", ""],
    )
    chunks = splitter.split_text(text)
    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})
def preprocess_text_with_nlp_pymupdf(text=None):
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction.

    Args:
        text: Text to analyze. When None (the default, as used when invoked
            as a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response with tokens/lemmas (POS, dependency, stop-word
        flags), named entities (offsets and label descriptions), noun chunks,
        and the raw text.
    """
    if text is None:
        text = request.form.get('text', '')
    doc = nlp(text)

    # Tokenization, lemmatization, and POS tagging; punctuation is dropped.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entity recognition with character offsets.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    # Base noun phrases with their syntactic roots.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    return jsonify({
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    })
def recursive_character_text_splitter(max_chunk_size=512, overlap=50, text=None):
    """Split text into overlapping chunks, preserving page markers.

    Previously the chunk size and overlap were hard-coded; they are now
    keyword parameters with the same values as defaults (backward compatible),
    matching the signature style of preprocess_text_with_nlp_llm.

    Args:
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Characters of overlap between consecutive chunks.
        text: Text to split. When None (the default, as used when invoked as
            a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response: {'chunks': [...]}.
    """
    if text is None:
        text = request.form.get('text', '')
    # The trailing "" separator restores the character-level fallback so no
    # chunk can exceed max_chunk_size even without whitespace to split on.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " ", ""],
    )
    chunks = splitter.split_text(text)
    return jsonify({"chunks": chunks})
if __name__ == '__main__':
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host='0.0.0.0', port=7860)