Skip to content

Conll


conll_gen(video_id, string_text, language)

Generate CoNLL-U format parsing for input text.

Parameters:

Name Type Description Default
video_id str

Identifier for the video

required
string_text str

Input text to be parsed

required
language str

Language code for the text

required

Returns:

Type Description
list

Parsed CoNLL-U sentences

Source code in apps/annotator/code/text_processor/conll.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def conll_gen(video_id:str,string_text:str, language:str):
    """
    Generate CoNLL-U format parsing for input text.

    Parameters
    ----------
    video_id : str
        Identifier for the video
    string_text : str
        Input text to be parsed
    language : str
        Language code for the text

    Returns
    -------
    list
        Parsed CoNLL-U sentences

    Raises
    ------
    KeyError
        If `language` has no UDPipe model configured in ``CONLL._models``.
    requests.HTTPError
        If the UDPipe service responds with an HTTP error status.
    """
    # Return the cached parse if the conll is already stored on the server.
    conll = get_conll(video_id)

    if conll is not None:
        return parse(conll)

    # Request the CoNLL-U parse from the UDPipe web API, selecting the
    # model for the requested language.
    files = {
        'data': string_text,
        'model': (None, CONLL._models[language]),
        'tokenizer': (None, ''),
        'tagger': (None, ''),
        'parser': (None, ''),
    }
    r = requests.post('http://lindat.mff.cuni.cz/services/udpipe/api/process', files=files)
    # Fail loudly on HTTP errors instead of a cryptic KeyError on 'result'.
    # NOTE: renamed from `re` — the old name shadowed the regex module.
    r.raise_for_status()
    response = r.json()

    # Cache the raw conll so future calls skip the remote request.
    conll = response['result']
    insert_conll_MongoDB({'video_id':video_id, 'conll':conll})
    return parse(conll)

get_text(video_id, return_conll=False)

Retrieve text from stored CoNLL-U format.

Parameters:

Name Type Description Default
video_id str

Identifier for the video

required
return_conll bool

If True, returns both text and CoNLL-U format

False

Returns:

Type Description
str or tuple

Text string if return_conll=False; tuple of (text, conll) if return_conll=True

None

If no CoNLL data found for video_id

Source code in apps/annotator/code/text_processor/conll.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def get_text(video_id:str, return_conll:bool=False):
    """
    Retrieve the plain text reconstructed from a stored CoNLL-U parse.

    Parameters
    ----------
    video_id : str
        Identifier for the video
    return_conll : bool, default=False
        If True, return a ``(text, conll)`` tuple instead of just the text

    Returns
    -------
    str or tuple
        Text string if return_conll=False;
        tuple of (text, conll) if return_conll=True
    None
        If no CoNLL data found for video_id
    """
    conll = get_conll(video_id)

    if conll is None:
        return None

    # Each parsed sentence carries its surface text in the metadata block;
    # stitch them back together with single spaces.
    sentences = parse(conll)
    text = " ".join(sentence.metadata['text'] for sentence in sentences)
    return (text, conll) if return_conll else text

html_interactable_transcript_legacy(subtitles, conll_sentences, language)

Create an interactive HTML transcript with word-level annotations.

Parameters:

Name Type Description Default
subtitles list

List of subtitle dictionaries with text and timing

required
conll_sentences list

Parsed CoNLL-U sentences

required
language str

Language code for the text

required

Returns:

Type Description
tuple

(lemmatized_subtitles, all_lemmas) where: - lemmatized_subtitles: list of dicts with HTML-formatted text - all_lemmas: list of unique lemmas found in text

Source code in apps/annotator/code/text_processor/conll.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def html_interactable_transcript_legacy(subtitles:list, conll_sentences:list, language:str):
    """
    Create an interactive HTML transcript with word-level annotations.

    Each subtitle word is wrapped in a <span> carrying its lemma, the id
    of the CoNLL sentence it was aligned to, its word id within that
    sentence, and the subtitle's start/end times.

    Parameters
    ----------
    subtitles : list
        List of subtitle dictionaries with text and timing
        (keys "text", "start", "end")
    conll_sentences : list
        Parsed CoNLL-U sentences
    language : str
        Language code for the text

    Returns
    -------
    tuple
        (lemmatized_subtitles, all_lemmas) where:
        - lemmatized_subtitles: list of dicts with HTML-formatted text
        - all_lemmas: list of unique lemmas found in text
    """
    # Function-scope import — NOTE(review): presumably to avoid a circular
    # dependency at module load time; confirm against text_processor.words.
    from text_processor.words import SemanticText
    #lemmatizer = WordNetLemmatizer()
    sem_text = SemanticText("",language)

    # Alignment cursors between subtitle words and CoNLL tokens.
    sent_id = 0        # index of the current CoNLL sentence
    word_id = 0        # id of the last matched token inside that sentence
    word_counter = 0   # how many tokens of the sentence were consumed so far

    all_lemmas = []
    lemmatized_subtitles = []

    for sub in subtitles:
        sent = {"text": ""}
        text:str = sub["text"]
        # Split elisions/apostrophes off the preceding word so the subtitle
        # tokenization resembles the CoNLL tokenization.
        text = text.replace("\n", " ").replace("’", " ’").replace("'", " '")
        in_phrase_word_indx = 0


        # # add a blank space before and after punctuation
        # text = re.sub('([.,!?():])', r' \1 ', text)
        # text = re.sub('\s{2,}', ' ', text)
        text = text.replace("/", " / ")#.replace("-", " - ")
        text = text.replace("'", " '").replace("’", " ’").replace("”", " ” ").replace("“", " “ ")
        if language == "it":
            # Undo over-splitting of the Italian elided article ("l'").
            text = text.replace("l '","l' ")
        #text_words = text.split(" ")
        text_words = re.split(' |-', text)
        #print(text_words)
        for w in text_words:
            sentence_finished = False
            if w != '':

                if w not in [".",":","?","!",",",";","/","“","'",'"',"”"]:

                    # Normalized (lowercased, punctuation-stripped) form used
                    # to match against CoNLL token forms.
                    text_word = w.lower().translate(str.maketrans('', '', string.punctuation))\
                        .replace('”', '').replace("“", "").replace('…', '')

                    # There is a bug where sent_id can exceed len(conll_sentences);
                    # this guard works around it (conll_words then keeps the
                    # value from the previous iteration).
                    if sent_id < len(conll_sentences):
                        conll_words = conll_sentences[sent_id].filter(upos=lambda x: x != "PUNCT")

                    # Split hyphenated CoNLL tokens into two entries so they
                    # line up with the hyphen-split subtitle words above.
                    for i, c in enumerate(conll_words):
                        if "-" in c["form"]:
                            conll_words.insert(i, c.copy())
                            conll_words[i]["form"] = conll_words[i]["form"].split("-")[0]
                            conll_words[i+1]["form"] = conll_words[i+1]["form"].split("-")[1]

                    # Scan forward from the last consumed token for a match.
                    for i in range(word_counter, len(conll_words)):

                        conll_word = str(conll_words[i]["form"]).lower().translate(str.maketrans('', '', string.punctuation))
                        #print( conll_word, text_word)

                        if text_word == conll_word:
                            word_id = conll_words[i]["id"]
                            word_counter += 1

                            # Matching the sentence's last token means the
                            # next subtitle word starts a new CoNLL sentence.
                            if conll_words[i]["id"] == conll_words[-1]["id"]:
                                sentence_finished = True
                            break

                # Sentence-final punctuation keeps the current index; other
                # words are tagged sent_id + 1 — NOTE(review): apparently the
                # HTML uses 1-based sentence ids; confirm with the consumer.
                if w in ["!", "?", "."]:
                    s = sent_id
                else:
                    s = sent_id + 1

                #print(s)
                # Strip one trailing punctuation mark before lemmatizing.
                toLemmatize = w.lower()
                if toLemmatize[-1] in ["?", ".", "!", ";", ","]:
                    toLemmatize = toLemmatize[:-1]
                sem_text.set_text(toLemmatize)
                lemma = sem_text.lemmatize()[0]
                #lemma = lemmatizer.lemmatize(toLemmatize)
                #print(toLemmatize, lemma)
                #print(sub)

                sent["text"] += f'<span lemma="{lemma}" sent_id="{str(s)}" word_id="{str(word_id)}" start_time="{sub["start"]}" end_time="{sub["end"]}" >' + w + '</span> '


                if lemma not in all_lemmas:
                    all_lemmas.append(lemma)

                # Reset the cursors when the CoNLL sentence was exhausted.
                if sentence_finished:
                    sent_id += 1
                    word_id = 0
                    word_counter = 0
                in_phrase_word_indx += 1

        lemmatized_subtitles.append(sent)

    return lemmatized_subtitles, all_lemmas

html_interactable_transcript_word_level(sentences)

Create word-level interactive HTML transcript with detailed linguistic annotations.

Parameters:

Name Type Description Default
sentences list

List of sentence dictionaries containing word information

required

Returns:

Type Description
list

List of dictionaries containing HTML-formatted sentences with linguistic annotations (lemma, POS, gender, number) and timing information for each word

Notes

Special handling implemented for:

  • Articulated prepositions with apostrophe

  • Verbs with clitic pronouns

  • Words with punctuation

  • Website URLs

Source code in apps/annotator/code/text_processor/conll.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def html_interactable_transcript_word_level(sentences:list):
    """
    Build word-level interactive HTML for already-annotated sentences.

    Parameters
    ----------
    sentences : list
        Sentence dicts, each with a "words" list; every word dict carries
        "word", "lemma", "start", "end", "cpos", "pos", "gen" and "num".

    Returns
    -------
    list
        One dict per sentence with a "text" key holding the HTML string:
        a <span> per word annotated with lemma, POS, gender, number and
        timing attributes.

    Notes
    -----
    A separating space is normally emitted after each span, but it is
    suppressed for:\n
    - articulated prepositions ending with an apostrophe (e.g. "dell'")\n
    - punctuation (except the comma) and numbers\n
    - words followed by clitic pronouns, punctuation, apostrophes, or
      URL fragments saved as separate words (e.g. ["www", ".google", ".com"])
    """
    html_lemmatized_sents = []
    for sent_idx, sentence in enumerate(sentences):
        words = sentence["words"]
        pieces = []
        for word_idx, word in enumerate(words):
            pieces.append(
                f'<span lemma="{word["lemma"]}"'
                f' sent_id="{str(sent_idx)}"'
                f' word_id="{str(word_idx)}"'
                f' start_time="{word["start"]}"'
                f' end_time="{word["end"]}"'
                f' cpos="{word["cpos"]}"'
                f' pos="{word["pos"]}"'
                f' gen="{word["gen"]}"'
                f' num="{word["num"]}" >'
                f'{word["word"]}'
                '</span>'
            )
            pieces.append(" ")

            # Does the NEXT word need to be glued to this one (no space)?
            nxt = words[word_idx + 1] if word_idx + 1 < len(words) else None
            glue_to_next = nxt is not None and (
                nxt["cpos"] == "X"
                or nxt["pos"] in ["FC", "FF", "FS"]
                or nxt["word"].startswith(".")
                or nxt["word"].startswith("'")
                or nxt["cpos"] == "PUNCT"
                or (word["cpos"] == "V" and nxt["pos"] == "PC")
            )
            # Drop the just-added separator when this word ends with an
            # apostrophe, is (non-comma) punctuation or a number, or the
            # next word must be glued on.
            if (word["word"].endswith("'")
                    or (word["cpos"] == "PUNCT" and word["word"] != ",")
                    or word["cpos"] == "NUM"
                    or glue_to_next):
                pieces.pop()
        html_lemmatized_sents.append({"text": "".join(pieces)})
    return html_lemmatized_sents