Advanced writing in a TXT file
Split a file according to its number of lines
What follows is not strictly about Pandas, but it is useful in some Pandas contexts.
First we count the number of lines in the file (from the number of occurrences of \n), then we split it into files of 50,000 lines each if it exceeds 50,000 lines.
# Split FixQueries.sql into numbered files of at most `lines_max` lines each.
# The size of the file is estimated from the number of "\n" it contains.
with open(workDirectory+'FixQueries.sql', 'r') as myfile:
    data = myfile.read()

taille_fichier = data.count("\n")
lines_max = 50000
numero_fichier = 0

if taille_fichier > lines_max:
    print('Attention : le fichier de sortie fait '+str(taille_fichier)+' lignes ! Veuillez patienter...')

    smallfile = None
    try:
        with open(workDirectory+'FixQueries.sql') as bigfile:
            for lineno, line in enumerate(bigfile):
                # Every `lines_max` lines, close the current part and start the next one.
                if lineno % lines_max == 0:
                    if smallfile:
                        smallfile.close()
                    numero_fichier += 1
                    small_filename = workDirectory + 'FixQueries {}.sql'.format(numero_fichier)
                    smallfile = open(small_filename, "w")
                smallfile.write(line)
    finally:
        # Close the last part even if reading or writing failed half-way.
        if smallfile:
            smallfile.close()

    # Report the number of parts actually written: the "\n" count under-estimates
    # it when the last line of the file has no trailing newline.
    print('Nous l\'avons découpé en ', numero_fichier, 'fichiers !\n')

    # The original is deleted only once it has been split successfully.
    os.remove(workDirectory+'FixQueries.sql')
And then merge the files:
# Concatenate the listed files, in list order, into a single output file.
filenames = [
    'C:/_gh/0/file_25000.txt',
    'C:/_gh/0/file_50000.txt',
    'C:/_gh/0/file_75000.txt',
    'C:/_gh/0/file_100000.txt',
    'C:/_gh/0/file_125000.txt',
]

with open('C:/_gh/0/CUMUL1.txt', 'w') as merged:
    for part_path in filenames:
        with open(part_path) as part:
            part_content = part.read()
            merged.write(part_content)
You may need to list the names of the files before merging them, for example with PowerShell (Alt+F+R):
# Write the name of every item in the current folder to _files_list.txt
Get-ChildItem | Select-Object -ExpandProperty Name > _files_list.txt
- Attention 👈
- Be careful to keep the files in order, and to keep the last line empty.
-
Replace text in a file
my_file = r'C:/_gh/0/_Urbanisme/test.txt'

# Each pair is (text to find, text to put in its place); pairs are applied in order.
# str.replace matches literally, so the '?' in 'Batm?n' is not a wildcard.
replacements = [
    ('heros', 'héros'),
    ('Batm?n', 'Batman'),
    ('télévisee', 'télévisée'),
    (' s attaquent', ' s\'attaquent'),
]

with open(my_file, 'r') as source:
    content = source.read()

for old_text, new_text in replacements:
    content = content.replace(old_text, new_text)

# Overwrite the same file with the corrected text.
with open(my_file, 'w') as target:
    target.write(content)

# Confirm
print("OK !")
This is not directly related to Pandas, but it is very useful for storing special notes to remember during data processing, and for automating that step.
Create/replace a TXT file and write something into it
Use the Write option (w):
MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Mode 'w' creates the file, or empties it first when it already exists.
with open(MyTxtFile, 'w') as output:
    output.write('Blablabla...\n')
    output.write(MyVariable)
Add some lines at the start of an existing file
MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Read the current content first: opening in 'w' mode below empties the file.
with open(MyTxtFile, 'r') as current:
    previous_content = current.read()

# Write the new first line, followed by everything the file held before.
with open(MyTxtFile, 'w') as rewritten:
    rewritten.write('XPTDR\n' + previous_content)
Add some lines at the end of an existing file
Use the Append option (a):
MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Mode 'a' keeps the existing content and writes after it.
with open(MyTxtFile, 'a') as output:
    output.write('Blablabla...\n')
    output.write(MyVariable)
Replace some lines in an existing file
Here we first Read (r option) and store the content of the file, then re-create it without the lines we want to delete:
MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# A line is dropped when it starts with any of these prefixes.
prefixes_to_delete = (
    'First line to delete...',
    'Second line to delete...',
    'Third line to delete...',
)

with open(MyTxtFile, 'r') as f:
    lines = f.readlines()

# str.startswith accepts a tuple, so one test covers the three prefixes.
kept_lines = [line for line in lines if not line.startswith(prefixes_to_delete)]

with open(MyTxtFile, 'w') as f:
    f.writelines(kept_lines)

    # Make sure the new lines start on their own line, even when the last
    # kept line has no trailing newline.
    if kept_lines and not kept_lines[-1].endswith('\n'):
        f.write('\n')

    # Each new line carries its own '\n'; without it the three strings
    # would be glued together on a single line.
    f.write('New line 1\n')
    f.write('New line 2\n')
    f.write('New line 3\n')
Replace a simple string in a TXT file
MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

with open(MyTxtFile, 'r') as source:
    original_text = source.read()

# Every occurrence of 'String 1' is replaced, not only the first one.
updated_text = original_text.replace('String 1', 'String 2')

# Overwrite the file with the updated text.
with open(MyTxtFile, 'w') as target:
    target.write(updated_text)
Replace a string with a regular expression (regex) in a TXT file
import re

FichierTest = 'C:/_gh/0/FichierTest.txt'

# Useful patterns:
#   Find number            : r'[0-9]'
#   Find line break        : r'[\n]'
#   Find double line break : r'[\r\n]{2,}'
multiple_line_breaks = re.compile(r'[\r\n]{2,}')

# 'r+' opens the existing file for both reading and writing.
with open(FichierTest, 'r+') as handle:
    content = handle.read()
    # Collapse every run of two or more line breaks into exactly two.
    content = multiple_line_breaks.sub('\n\n', content)
    # Go back to the beginning before writing the new content.
    handle.seek(0)
    handle.write(content)
    # The new text may be shorter than the old one: drop what is left after it.
    handle.truncate()
Merge 2 txt files
MyTxtFile1 = 'C:/_gh/0/My beautiful file 1.txt'
MyTxtFile2 = 'C:/_gh/0/My beautiful file 2.txt'
MyTxtFileMerged = 'C:/_gh/0/My beautiful file merged.txt'

# Load both source files completely before writing anything.
with open(MyTxtFile1, 'r') as first_source:
    first_content = first_source.read()

with open(MyTxtFile2, 'r') as second_source:
    second_content = second_source.read()

# The merged file holds file 1 followed by file 2, written as UTF-8.
with open(MyTxtFileMerged, 'w', encoding='utf-8') as merged:
    merged.write(first_content)
    merged.write(second_content)

# The two source files are deleted once the merged file has been written.
for source_path in (MyTxtFile1, MyTxtFile2):
    os.remove(source_path)
Test if a comma is used as decimal separator in a TXT file
# Matches a comma between two runs of digits, in a cell that contains nothing
# else (a cell starts at a tab or line start and ends at a tab or line end).
# Python's `re` module does not support \K, so the comma is captured in a group
# instead; re.MULTILINE makes ^ and $ match at every line of the file.
decimal_comma = re.compile(r'(?:\t|^)\d+(,)(?=\d+(?:\t|$))', flags=re.MULTILINE)

with open(MyTxtFile, 'r') as myfile:
    data = myfile.read()

# One ',' per offending value; an empty list means no decimal comma was found.
pb_regex = decimal_comma.findall(data)

if pb_regex:
    # The pattern shown to the user keeps \K: it is meant for the text editor,
    # not for Python. Backslashes are escaped so the printed text is unchanged.
    print(colored('\nAttention, comma is used as a decimal separator in numeric value!\n' +
                  'Find them in Notepad with:\n(?:\\t|^)\\d+\\K,(?=\\d+(?:\\t|$))', 'yellow'))
Write sentences from dataframe
MyTxtFile = WorkDirectory + ' - My file.txt'

with open(MyTxtFile, 'w') as report:
    report.write('\nSpecial cases:\n\n')

    # Walk the four columns in lockstep: one tuple of values per dataframe row.
    rows = zip(
        df['Field1'].tolist(),
        df['Field2'].tolist(),
        df['Field3'].tolist(),
        df['Field4'].tolist(),
    )

    # Build every sentence before writing any of them.
    sentences = ['- {!s} {!s} ({!s}) in {!s}'.format(*row) for row in rows]

    report.writelines(sentence + '\n' for sentence in sentences)
Convert encoding
With chardet:
from chardet import detect


def get_encoding_type(file):
    """Return the 'encoding' entry of chardet's detection result for *file*."""
    # Binary mode: chardet works on the raw bytes, before any decoding.
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']


from_codec = get_encoding_type(MyOriginalFile)

# A falsy result (no encoding detected) is skipped: there is nothing to decode
# with, and concatenating it into the message below would raise a TypeError.
if from_codec and from_codec.lower() != 'utf-8':
    print(colored('Attention, script is ' + from_codec + '!', 'yellow'))

    try:
        # Read with the detected encoding, write the same text back as UTF-8.
        with open(MyOriginalFile, 'r', encoding=from_codec) as f, open(MyNewFile, 'w', encoding='utf-8') as e:
            text = f.read()
            e.write(text)
    except UnicodeDecodeError:
        print('Decode Error')
    except UnicodeEncodeError:
        print('Encode Error')
    else:
        # Both files are closed at this point, and the conversion succeeded:
        # the converted file can now take the place of the original.
        os.remove(MyOriginalFile)
        os.rename(MyNewFile, MyOriginalFile)
        print(colored('We converted the file as UTF-8.', 'yellow'))
With io:
import io

# Decode the source file as UTF-8...
with io.open(OriginalTxtFile, mode='r', encoding='utf8') as source:
    text = source.read()

# ...then write the same text out encoded as cp1252.
with io.open(FinalTxtFile, mode='w', encoding='cp1252') as target:
    target.write(text)
Replace a string with a dataframe in an existing txt file
Do a replace with a df.to_string(), maybe with a little regex to remove unwanted spaces.
with open(MtTxtFile, 'r') as file:
    # Load the current content: the replacements below operate on this text.
    MyTxtData = file.read()

# Swap the placeholder for the dataframe rendered as text, without header or index.
MyTxtData = MyTxtData.replace('STRING TO REPLACE', df.to_string(header=False, index=False, justify='left'))

# Remove the spaces at the start of every line (re.MULTILINE makes ^ match per line).
MyTxtData = re.sub(r'^ *', '', MyTxtData, flags=re.MULTILINE)

# Overwrite the file with the updated text.
with open(MtTxtFile, 'w') as file:
    file.write(MyTxtData)