Spaces:
Running
Running
Commit ·
251ecbb
1
Parent(s): 26ab567
lng indices
Browse files
- const.py +1 -0
- lng/L2SCA/analyzeText.py +87 -35
const.py
CHANGED
|
@@ -1030,6 +1030,7 @@ used_indices = [
|
|
| 1030 |
63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
|
| 1031 |
257, 258, 261, 263, 272, 274
|
| 1032 |
]
|
|
|
|
| 1033 |
|
| 1034 |
eval_indices = [4,5,6,18,257,272]
|
| 1035 |
eval_indices = [used_indices.index(idx) for idx in eval_indices]
|
|
|
|
| 1030 |
63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
|
| 1031 |
257, 258, 261, 263, 272, 274
|
| 1032 |
]
|
| 1033 |
+
lftk_used_indices = [1, 7, 8, 9, 10, 11, 12, 17, 65, 68, 73, 78, 80, 198, 201, 202, 205, 207, 216, 218]
|
| 1034 |
|
| 1035 |
eval_indices = [4,5,6,18,257,272]
|
| 1036 |
eval_indices = [used_indices.index(idx) for idx in eval_indices]
|
lng/L2SCA/analyzeText.py
CHANGED
|
@@ -62,40 +62,94 @@ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
|
|
| 62 |
pre_path = 'lng/L2SCA'
|
| 63 |
|
| 64 |
#location of the Stanford parser
|
| 65 |
-
parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04
|
| 66 |
|
| 67 |
def sca(input_text):
|
| 68 |
-
inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
|
| 69 |
-
with open(inputFile, 'w') as f:
|
| 70 |
-
f.write(input_text + '\n')
|
| 71 |
-
|
| 72 |
-
#extract the name of the file being processed
|
| 73 |
output = []
|
| 74 |
-
|
| 75 |
-
#
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
#
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
#
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
for pattern in patternlist:
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
count = 0
|
| 97 |
patterncount.append(count)
|
| 98 |
|
|
|
|
| 99 |
#update frequencies of complex nominals, clauses, and T-units
|
| 100 |
patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
|
| 101 |
patterncount[2]=patterncount[2]+patterncount[-3]
|
|
@@ -103,10 +157,12 @@ def sca(input_text):
|
|
| 103 |
patterncount[1]=patterncount[1]+patterncount[-1]
|
| 104 |
|
| 105 |
#word count
|
| 106 |
-
infile=open(parsedFile,"r")
|
| 107 |
-
content=infile.read()
|
| 108 |
-
w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
|
| 109 |
-
infile.close()
|
|
|
|
|
|
|
| 110 |
|
| 111 |
#add frequencies of words and other structures to output string
|
| 112 |
output.append(int(w))
|
|
@@ -139,8 +195,4 @@ def sca(input_text):
|
|
| 139 |
#list of 24 comma-delimited fields
|
| 140 |
# fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
|
| 141 |
|
| 142 |
-
#delete the temporary file holding the parse trees
|
| 143 |
-
os.remove(inputFile)
|
| 144 |
-
os.remove(parsedFile)
|
| 145 |
-
|
| 146 |
return output
|
|
|
|
| 62 |
pre_path = 'lng/L2SCA'
|
| 63 |
|
| 64 |
#location of the Stanford parser
|
| 65 |
+
parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04")
|
| 66 |
|
| 67 |
def sca(input_text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
output = []
|
| 69 |
+
# inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
|
| 70 |
+
# with open(inputFile, 'w') as f:
|
| 71 |
+
# f.write(input_text + '\n')
|
| 72 |
+
|
| 73 |
+
# #extract the name of the file being processed
|
| 74 |
+
# output = []
|
| 75 |
+
|
| 76 |
+
# #name a temporary file to hold the parse trees of the input file
|
| 77 |
+
# parsedFile=inputFile+".parsed"
|
| 78 |
+
|
| 79 |
+
# #parse the input file
|
| 80 |
+
# command=[parserPath, inputFile]
|
| 81 |
+
# with open(parsedFile, 'w') as f:
|
| 82 |
+
# subprocess.run(command, stdout = f,
|
| 83 |
+
# stderr = subprocess.DEVNULL
|
| 84 |
+
# )
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# #list of counts of the patterns
|
| 88 |
+
# patterncount=[]
|
| 89 |
+
|
| 90 |
+
# #query the parse trees using the tregex patterns
|
| 91 |
+
# for pattern in patternlist:
|
| 92 |
+
# command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
|
| 93 |
+
# out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
|
| 94 |
+
# if len(out.stdout) > 0:
|
| 95 |
+
# count = int(out.stdout)
|
| 96 |
+
# else:
|
| 97 |
+
# count = 0
|
| 98 |
+
# patterncount.append(count)
|
| 99 |
+
|
| 100 |
+
# Parse directly into memory
|
| 101 |
+
stanford_parser_jar = os.path.join(parserPath, "stanford-parser.jar")
|
| 102 |
+
stanford_models_jar = os.path.join(parserPath, "stanford-parser-3.3.1-models.jar")  # TODO: confirm this version matches the models jar actually present in parserPath
|
| 103 |
+
|
| 104 |
+
command = [
|
| 105 |
+
"java",
|
| 106 |
+
"-mx1500m",
|
| 107 |
+
"-cp",
|
| 108 |
+
f"{stanford_parser_jar}:{stanford_models_jar}:",
|
| 109 |
+
"edu.stanford.nlp.parser.lexparser.LexicalizedParser",
|
| 110 |
+
"-outputFormat", "penn", # Changed output format to penn
|
| 111 |
+
"-sentences", "newline",
|
| 112 |
+
"edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
|
| 113 |
+
"-", # Read from standard input
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
# Parse the input text.
|
| 117 |
+
process = subprocess.run(command, input=input_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
| 118 |
+
parsed_output = process.stdout.decode()
|
| 119 |
+
|
| 120 |
+
# Word count
|
| 121 |
+
w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
|
| 122 |
+
|
| 123 |
+
# For each tregex pattern in the list, run tregex in filter mode
|
| 124 |
+
# so that it reads the parsed trees from standard input.
|
| 125 |
+
patterncount = []
|
| 126 |
for pattern in patternlist:
|
| 127 |
+
tregex_command = [
|
| 128 |
+
"java",
|
| 129 |
+
"-mx100m",
|
| 130 |
+
"-cp",
|
| 131 |
+
f"{pre_path}/stanford-tregex.jar",
|
| 132 |
+
"edu.stanford.nlp.trees.tregex.TregexPattern",
|
| 133 |
+
"-filter", # Use filter mode: read trees from stdin
|
| 134 |
+
"-C", # Suppress printing matches; only outputs the count
|
| 135 |
+
"-o", # Report each tree node only once as the root of a match
|
| 136 |
+
pattern # Supply the pattern as a command-line argument
|
| 137 |
+
]
|
| 138 |
+
tregex_process = subprocess.run(
|
| 139 |
+
tregex_command,
|
| 140 |
+
input=parsed_output.encode(),
|
| 141 |
+
stdout=subprocess.PIPE,
|
| 142 |
+
stderr=subprocess.DEVNULL,
|
| 143 |
+
)
|
| 144 |
+
tregex_output = tregex_process.stdout.decode().strip()
|
| 145 |
+
|
| 146 |
+
try:
|
| 147 |
+
count = int(tregex_output)
|
| 148 |
+
except ValueError:
|
| 149 |
count = 0
|
| 150 |
patterncount.append(count)
|
| 151 |
|
| 152 |
+
|
| 153 |
#update frequencies of complex nominals, clauses, and T-units
|
| 154 |
patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
|
| 155 |
patterncount[2]=patterncount[2]+patterncount[-3]
|
|
|
|
| 157 |
patterncount[1]=patterncount[1]+patterncount[-1]
|
| 158 |
|
| 159 |
#word count
|
| 160 |
+
# infile=open(parsedFile,"r")
|
| 161 |
+
# content=infile.read()
|
| 162 |
+
# w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
|
| 163 |
+
# infile.close()
|
| 164 |
+
w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
|
| 165 |
+
|
| 166 |
|
| 167 |
#add frequencies of words and other structures to output string
|
| 168 |
output.append(int(w))
|
|
|
|
| 195 |
#list of 24 comma-delimited fields
|
| 196 |
# fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
return output
|