Read/export data file (CSV, Excel, MySQL, Postgres, JSON, Shape ...)
Read and display an Excel file
import pandas as pd from tabulate import tabulate df = pd.read_excel('C:/Users/Your file.xlsx', sheet_name='Export', engine='openpyxl', usecols=['My first column', 'My second column']) print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=True))
I set engine='openpyxl'
because it is from an XLSX file. sheet_name
specifies the sheet to import, specify your columns with usecols
.
Then I use tabulate
to display properly my data in a Postgres way (tablefmt='psql'
).
With head(35)
I just display the 35 first lines, but the dataframe here named df
contains all the data.
If you wan to re-organize the columns, please see the Miscellaneous chapter.
Read a CSV file
import pandas as pd from tabulate import tabulate df_subscriber = pd.read_csv(workDirectory+'joo_acymailing_subscriber.csv', sep=';', usecols=['subid', 'source', 'email']) print(tab(df_subscriber.head(10), headers='keys', tablefmt='psql', showindex=False))
Add textual column names from a CSV without column names
Use names=[]
:
import pandas as pd from tabulate import tabulate df = pd.read_csv(inpuMyFilePath, names=['Mon Champ 1', 'Mon Champ 2', 'Mon Champ 3'], sep=',', keep_default_na=False).astype(str) print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))
Read a non-standard Excel file
Sometimes a file has wrong top lines, or you can not trust the future column names.
To read a file from a specific row for example, skipping the last rows.
import pandas as pd from tabulate import tabulate df = pd.read_excel(r'C:/Users/Georges/Downloads/test excel.xlsx', sheet_name='Sheet1', engine='openpyxl', usecols=[0,1], header=5-1, skipfooter=1, names=['id', 'value']) print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=False))
Or skip a specific line (with skiprows=[...]
):
... df = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl', skiprows=[2]) ...
Use first row as label column
my_labels = df.iloc[0] df = df[1:] df.columns = my_labels
Read from MySQL with sqlalchemy
sqlalchemy
is the best way with pandas:
from tabulate import tabulate as tab import pandas as pd from sqlalchemy import create_engine username = 'root' password = '' port = 3308 database = 'my_beautiful_db' host ='localhost' engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database)) sql = 'SELECT COUNT(*) AS total FROM communes ;' df = pd.read_sql_query(sql, engine).set_index('total') print(tab(df.head(5), headers='keys', tablefmt='psql')) engine.dispose()
Specify database encoding
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (username, password, host, port, database))
Get result from a SQL query with sqlalchemy
for one result
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (userTarget, passwordTarget, hostTarget, portTarget, databaseTarget)) sql = 'SELECT 1 ;' my_connection = engine.connect() my_query = my_connection.execute(text(sql)).scalar()
Read from MySQL with mysqlclient
import MySQLdb from tabulate import tabulate import pandas as pd mydb = MySQLdb.connect(user='root', password='', database='MyDb', host='localhost', port=3306) query = 'SELECT name FROM users ;' df = pd.read_sql(query, mydb) mydb.close() print(tabulate(df.head(10), headers='keys', tablefmt='psql')) number = df.shape[0] print(number)
Read from MySQL with mysql.connector
import mysql.connector as connection import pandas as pd from tabulate import tabulate as tab mydb = connection.connect(host='XXX.XX.XXX.XX', port=3308, database='MyDb',user='Me', passwd='pwdXXX!456', use_pure=True) try: query = "SELECT firstname, lastname FROM customer ;" df = pd.read_sql(query, mydb) except Exception as e: mydb.close() print(str(e)) print(tab(df.head(5), headers='keys', tablefmt='psql'))
Read from Postgres
import pandas as pd from tabulate import tabulate import psycopg2 con = psycopg2.connect(database='mytransformations', user='postgres', password='', host='localhost', port='5432') print("OK, we are connect to mytransformations") # DISPLAY WITH PANDAS df = pd.read_sql("SELECT * FROM myparcels ;", con) con.close() df.drop('geom', axis=1, inplace=True) print('\nResults:') print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False)) print('Number of records:', df.shape[0])
Read from a JSON
import pandas as pd from tabulate import tabulate import json import requests response = requests.get(MON_URL) my_file = json.loads(response.text) my_json = my_file['data'] mon_df = pd.DataFrame.from_dict(my_json)
Read from a shape
Use geopandas
:
import geopandas as gpd from tabulate import tabulate myShape = 'C:\\Path\\Of\\My\\Shape.shp' print('\n' + myShape) df = gpd.read_file(myShape) df['type'] = df['geometry'].astype(str).str.replace(r' .*|\(.*', '', regex=True) df = df[['id', 'type', 'geometry']] print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=True)) print(df.shape[0]) MyFieldList = df['id'].drop_duplicates().dropna().sort_values(ascending=False).tolist() print('\n' + str(MyFieldList)) MyGeomTypeList = df['type'].drop_duplicates().dropna().sort_values(ascending=False).tolist() print('\n' + str(MyGeomTypeList))
Export a data-frame in a new Excel file
df.to_excel("C:/Users/Your new file.xlsx", index=False, sheet_name='Tab 1')
index=False
to not get the index column.
Export a data-frame in an existing Excel sheet
Use mode='a'
and if_sheet_exists
:
from openpyxl import load_workbook import openpyxl My_File_Path= 'C:/My/File.xlsx' with pd.ExcelWriter(My_File_Path, 'openpyxl', mode='a', if_sheet_exists='replace') as writing: df.to_excel(writing, 'My sheet', index=False)
Export several dataframes in Excel sheets
If you have several data-frame to export in one Excel file, each in unique sheets, you should open an ExcelWriter
. And close it.
writer = pd.ExcelWriter('C:/Users/Your new file.xlsx', engine='xlsxwriter') df_Country.to_excel(writer, index=False, sheet_name='Country', header=['Country', 'Total']) df_Emails.to_excel(writer, index=False, sheet_name='Emails', header=['Emails', 'Total']) writer.save()
Export several dataframes in the same file in specific sheets
writer = pd.ExcelWriter('C:/Users/Your new file.xlsx', engine='xlsxwriter') df.to_excel(writer, index=False, sheet_name='Registrations', startcol=3, startrow=34, header=['Filed', 'Total', '%'])
Export a dataframe as a txt file, with separator and ANSI encoding
df.to_csv(my_file_import, header=None, index=None, sep='\t', encoding='mbcs')
Add filters in all columns in one sheet in an Excel file
import openpyxl from openpyxl.utils import get_column_letter outputExcelFile = r'C:/User/Your file.xlsx' workbook = openpyxl.load_workbook(outputExcelFile) worksheet = workbook['Sheet name'] FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row) worksheet.auto_filter.ref = FullRange workbook.save(outputExcelFile)
Add filters in all columns in all sheets in an Excel file
import openpyxl from openpyxl.utils import get_column_letter outputExcelFile = r'C:/User/Your file.xlsx' workbook = openpyxl.load_workbook(outputExcelFile) sheetsLits = workbook.sheetnames for sheet in sheetsLits: worksheet = workbook[sheet] FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row) worksheet.auto_filter.ref = FullRange workbook.save(outputExcelFile)
Add colors in all column names in a sheet in an Excel file
import openpyxl from openpyxl.styles import PatternFill workbook = openpyxl.load_workbook(My_File_Path) worksheet = workbook['My sheet'] for cell in worksheet[1]: worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid') workbook.save(My_File_Path)
Add colors in all column names in all sheets in an Excel file
import openpyxl from openpyxl.styles import PatternFill outputExcelFile = r'C:/Users/Your File.xlsx' workbook = openpyxl.load_workbook(outputExcelFile) sheetsLits = workbook.sheetnames for sheet in sheetsLits: worksheet = workbook[sheet] for cell in workbook[sheet][1]: worksheet[cell.coordinate].fill = PatternFill(fgColor = 'FFC6C1C1', fill_type = 'solid') workbook.save(outputExcelFile)
Modify a column width in one sheet in an Excel file
import openpyxl outputExcelFile = r'C:/Users/Your File.xlsx' workbook = openpyxl.load_workbook(outputExcelFile) sheetsLits = workbook.sheetnames workbook['Your sheet'].column_dimensions['A'].width = 75 workbook.save(outputExcelFile)
Modify all columns width in all sheets in an Excel file
The code below set the A column size to 30, and the others to 10.
import openpyxl from openpyxl.utils import get_column_letter outputExcelFile = r'C:/Users/Your File.xlsx' workbook = openpyxl.load_workbook(outputExcelFile) sheetsLits = workbook.sheetnames for sheet in sheetsLits: for cell in workbook[sheet][1]: if get_column_letter(cell.column) == 'A': workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 30 else: workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 10 workbook.save(outputExcelFile)
Add color to some line recovering their indexes from condition
list_duplicate = df.index[df['Ligne en doublon'] == 'DOUBLON !'].tolist() for index in list_duplicate: for cell in worksheet[index+2]: worksheet[cell.coordinate].fill = PatternFill(fgColor='FF6666', fill_type='solid') workbook.save(My_File_Path)
Freeze the top row of an Excel file
worksheet.freeze_panes = 'A2'
Freeze the top row and the first column of an Excel file
worksheet.freeze_panes = 'B2'
Freeze the top row and the 2 first columns of an Excel file
worksheet.freeze_panes = 'C2'
Change font color and style in an Excel file
from openpyxl.styles import Font worksheet['A1'].font = Font(color='FFFFFF', italic=False, bold=True)
Work on several Excel files
Here we freeze the top row, add filters, color and change the width of all columns from 3 Excel files.
import openpyxl from openpyxl.utils import get_column_letter from openpyxl.styles import PatternFill File1 = r'D:/Work/My file 1.xlsx' File2 = r'D:/Work/My file 2.xlsx' File3 = r'D:/Work/My file 3.xlsx' ExcelList = [File1, File2, File3] for ExcelFile in ExcelList: workbook = openpyxl.load_workbook(ExcelFile) sheetsLits = workbook.sheetnames for sheet in sheetsLits: worksheet = workbook[sheet] FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row) worksheet.auto_filter.ref = FullRange worksheet.freeze_panes = 'A2' for cell in workbook[sheet][1]: worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid') if get_column_letter(cell.column) == 'A': workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 12 else: workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 20 workbook.save(ExcelFile)
Insert a picture in an Excel file
And delete them after.
img = openpyxl.drawing.image.Image('C:/Users/myplot.png') img.anchor = 'E6' workbook['Your sheet'].add_image(img) workbook.save(outputExcelFile) os.remove('C:/Users/myplot.png')
Open a box to search a file and get his path
Use Tkinter.
from tkinter import Tk from tkinter.filedialog import askopenfilename Tk().withdraw() filename = askopenfilename() print(filename)
Open a box to save an Excel file
from tkinter import Tk from tkinter.filedialog import asksaveasfilename fileName = asksaveasfilename(filetypes=(("Excel files", "*.xlsx"), ("All files", "*.*")), defaultextension='.xslx') df.to_excel(fileName, index=False, sheet_name='Tab 1', header=None)
Check if a file exists
And use it only if it exists:
import os my_file = 'C:/_gh/0/export.xlsx' if os.path.isfile(my_file) == True: df_ExportFeuil1 = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl')
Rename a file
import os os.rename('My file.xlsx', 'My file renamed.xlsx')
Search duplicate columns in Excel
from openpyxl import load_workbook wb = load_workbook(filename=my_file) Feuil1 = wb['Feuil1'] # Récupérer les noms de colonnes dans la 3ème ligne noms_colonnes = [colonne.value for colonne in Feuil1[3]] print(noms_colonnes) # Vérifier les doublons dans les noms de colonnes doublons = set() noms_en_double = set() for nom_colonne in noms_colonnes: if nom_colonne in doublons: noms_en_double.add(nom_colonne) else: doublons.add(nom_colonne) # Afficher les noms de colonnes en doublons if noms_en_double: print(colored('Attention colonne en doublon dans le fichier Excel :', 'red')) print(colored(str(noms_en_double).replace('{', '').replace('}', ''), 'red')) sys.exit() else: print(colored('OK, pas de colonne en doublon dans le fichier Excel.', 'green'))