# colab에서 깃클론하기 (cloning the Git repository in Colab)
import os
import re
# def rule(x):
# # 괄호
# a = re.compile(r'\([^)]*\)')
# # 문장 부호
# b = re.compile('[^가-힣 ]')
# x = re.sub(pattern=a, repl='', string= x)
# x = re.sub(pattern=b, repl='', string= x)
# return x
# for filename in [_ for _ in os.listdir("KsponSpeech_0001") if _.endswith('.txt')]:
# newText = ""
# with open(os.path.join("KsponSpeech_0001", filename), 'r', encoding='cp949') as f:
# text = f.read()
# newText = rule(text)
# with open(os.path.join("KsponSpeech_0001", filename), 'w', encoding='cp949') as f:
# f.write(newText)
# print(os.path)
# Build "train.txt": one record per transcript file found in DATA_DIR, written
# as "<absolute path of the transcript file>\t<transcript text>\n".
DATA_DIR = "KsponSpeech_0001"

with open("train.txt", 'w', encoding='utf-8') as f2:
    # os.listdir() returns entries in arbitrary order; sort them so the
    # generated file is identical from run to run.
    for filename in sorted(os.listdir(DATA_DIR)):
        if not filename.endswith('.txt'):
            continue
        transcript_path = os.path.join(DATA_DIR, filename)
        # Transcripts are read as cp949 and re-emitted as UTF-8.
        with open(transcript_path, 'r', encoding='cp949') as f:
            text = f.read()
        # Resolve the path that includes DATA_DIR. Resolving the bare file
        # name would be relative to the current working directory and would
        # record a path outside the data directory.
        f2.write(os.path.realpath(transcript_path) + '\t' + text + '\n')
# Shell
# 복사 (copy)