amougou-fortiss committed on
Commit
44c4e3b
·
verified ·
1 Parent(s): ee45916

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py CHANGED
@@ -1,6 +1,10 @@
 
 
1
  from flask import Flask, request, jsonify
 
2
 
3
  app = Flask(__name__)
 
4
 
5
  @app.route('/')
6
  def index():
@@ -16,5 +20,97 @@ def reverse():
16
  text = request.form.get('text', '')
17
  return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  if __name__ == '__main__':
20
  app.run(host='0.0.0.0', port=7860)
 
1
+ import spacy
2
+
3
  from flask import Flask, request, jsonify
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
 
6
  app = Flask(__name__)
7
+ nlp = spacy.load("de_core_news_sm")
8
 
9
  @app.route('/')
10
  def index():
 
20
  text = request.form.get('text', '')
21
  return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
22
 
23
+ @app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
24
+ def preprocess_text_with_nlp_llm():
25
+ text = request.form.get('text', '')
26
+ doc = nlp(text)
27
+ # Enhanced tokenization and lemmatization with POS tags
28
+ tokens_and_lemmas = [
29
+ {
30
+ "token": token.text,
31
+ "lemma": token.lemma_,
32
+ "pos": token.pos_,
33
+ "dep": token.dep_, # Dependency parsing
34
+ "is_stop": token.is_stop,
35
+ }
36
+ for token in doc
37
+ if not token.is_punct
38
+ ]
39
+ # Enhanced named entity recognition with additional metadata
40
+ entities = [
41
+ {
42
+ "text": ent.text,
43
+ "label": ent.label_,
44
+ "start_char": ent.start_char,
45
+ "end_char": ent.end_char,
46
+ "description": spacy.explain(ent.label_), # Get explanation of entity type
47
+ }
48
+ for ent in doc.ents
49
+ ]
50
+ # Extract key phrases and noun chunks
51
+ noun_chunks = [
52
+ {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
53
+ for chunk in doc.noun_chunks
54
+ ]
55
+ preprocessed_data = {
56
+ "tokens_and_lemmas": tokens_and_lemmas,
57
+ "entities": entities,
58
+ "noun_chunks": noun_chunks,
59
+ "text": text,
60
+ }
61
+ # Split while preserving page and line markers
62
+ splitter = RecursiveCharacterTextSplitter(
63
+ chunk_size=max_chunk_size,
64
+ chunk_overlap=overlap,
65
+ separators=["\n[PAGE", "\n", " "],
66
+ )
67
+ chunks = splitter.split_text(text)
68
+ return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data)
69
+
70
+ @app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
71
+ def preprocess_text_with_nlp_pymupdf():
72
+ """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
73
+ text = request.form.get('text', '')
74
+ doc = nlp(text)
75
+
76
+ # Tokenization, lemmatization, and POS tagging
77
+ tokens_and_lemmas = [
78
+ {
79
+ "token": token.text,
80
+ "lemma": token.lemma_,
81
+ "pos": token.pos_,
82
+ "dep": token.dep_,
83
+ "is_stop": token.is_stop,
84
+ }
85
+ for token in doc
86
+ if not token.is_punct
87
+ ]
88
+
89
+ # Named entity recognition
90
+ entities = [
91
+ {
92
+ "text": ent.text,
93
+ "label": ent.label_,
94
+ "start_char": ent.start_char,
95
+ "end_char": ent.end_char,
96
+ "description": spacy.explain(ent.label_),
97
+ }
98
+ for ent in doc.ents
99
+ ]
100
+
101
+ # Noun chunks
102
+ noun_chunks = [
103
+ {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
104
+ for chunk in doc.noun_chunks
105
+ ]
106
+
107
+ return {
108
+ "tokens_and_lemmas": tokens_and_lemmas,
109
+ "entities": entities,
110
+ "noun_chunks": noun_chunks,
111
+ "text": text,
112
+ }
113
+
114
+
115
# Script entry point: start Flask's built-in server, bound to all interfaces
# on port 7860 (the conventional Hugging Face Spaces port).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)