ELRC, European Language Resource Coordination
Documents published on the European Parliament's official website
Available for Download ✅
⚠️ Always check the license of the data source before using the data ⚠️
- Main page: https://elrc-share.eu/
- Data Browse Link: https://elrc-share.eu/repository/search/
- Format: .tmx
# Sanity check on a single file: parse one sample TMX into (metadata, dataframe),
# report how many rows it produced, and preview the first few of them.
metadata, df = tmx2dataframe.read(
    'elrc/citizens_information_en-ga.tmx'
)
print(len(df))
df.head()
# Extract the rows whose target language matches `lang` from every .tmx file
# directly under `dir_path`, and write them to one CSV per source file, named
# "<lower-cased tmx path>.csv". Files that cannot be parsed are reported and
# skipped so that one bad file does not abort the whole run.
lang = 'ga'
dir_path = Path('elrc')
samp_count = 0

# Only .tmx files are processed, so only they are counted by the progress bar.
tmx_files = [f for f in dir_path.iterdir() if f.suffix == '.tmx']

for f in progress_bar(tmx_files):
    # Reset on every iteration: a `ga_df` left over from a previous file or
    # from another notebook cell must never be written out for this file.
    ga_df = None
    try:
        _, df = tmx2dataframe.read(str(f))
        # Normalise case so the language match below is case-insensitive.
        df.target_language = df.target_language.str.lower()
        # Substring match on the language code; na=False treats rows with a
        # missing target_language as non-matching instead of raising on a
        # NaN-containing boolean mask (which would drop the whole file).
        is_lang = df.target_language.str.contains(lang, na=False)
        ga_df = df[is_lang].copy()
    except Exception as err:
        # Best-effort extraction: report the problem and move on.
        print(f"Couldn't open {f}: {err}")

    if ga_df is not None and not ga_df.empty:
        ga_df['filepath'] = str(f)
        samp_count += len(ga_df)
        ga_df.reset_index(inplace=True, drop=True)
        ga_df.to_csv(f'{str(f).lower()}.csv')
        # Release the (possibly large) frame before parsing the next file.
        gc.collect()
    del ga_df

print(f'{samp_count} total text samples extracted')
# Merge every per-file CSV directly under `dir_path` into one dataframe and
# save it as a single compiled CSV in the current working directory.
lang = 'ga'
dir_path = Path('elrc')
f_list = [f for f in dir_path.iterdir() if f.suffix == '.csv']

# Read each CSV exactly once and concatenate at the end. Seeding the result
# with the first file and then concatenating that same file again would
# duplicate its rows, and concatenating inside the loop re-copies the
# accumulated data on every iteration.
frames = []
for f in progress_bar(f_list):
    try:
        frames.append(pd.read_csv(f, index_col=0))
    except Exception:
        # Best-effort: an unreadable CSV is reported and left out of the result.
        print(f'Error with opening {f}')

if not frames:
    # Fail with a clear message rather than a NameError further down, and
    # without overwriting an existing compiled file with an empty one.
    raise ValueError(f'No readable .csv files found in {dir_path}')

# ignore_index=True gives the compiled frame a fresh 0..n-1 index.
ga_df = pd.concat(frames, ignore_index=True)
print(len(ga_df))
ga_df.to_csv('elrc_en-ga_compiled_2020-06-11.csv', index=False)
ga_df.head()
Number of source documents:
Number of lines per source document: