A dataframe is a data structure equivalent to an individual table in a database.
Nowadays nearly every high-level programming language has its own implementation of this concept.
A dataframe is a table, indexed by rows and columns.
Imagine a table containing information about people:
data
name | age | height | sex
---|---|---|---
Andrea | 24 | 178 | Male
Maria | 33 | 154 | Female
Luca | 30 | 175 | Male
Dataframes allow us to collect and manipulate information in a very convenient way, and are the central pillar of modern data analysis.
A special role is played by data organized as tidy data, a term introduced to the community by Wickham in his seminal 2010 paper.
Tidy data is a way of structuring your data that makes analysis, visualization and data maintenance particularly easy. It is a simplified form of database normalization, a series of principles on how to design a good database.
A tidy dataframe is defined as having the following properties:
* each variable forms a column;
* each observation forms a row;
* each type of observational unit forms its own table.
An experiment will typically produce several tidy tables (a database), logically connected to one another. For example:
data
 | name | year | type | city | address
---|---|---|---|---|---
0 | Andrea | 2015 | residence | Rimini | via stretta 20
1 | Andrea | 2015 | domicile | Bologna | via larga 30
2 | Andrea | 2016 | residence | Rimini | via stretta 20
3 | Andrea | 2016 | domicile | Bologna | via larga 30
4 | Giulio | 2015 | residence | Rimini | via giusta 50
5 | Giulio | 2015 | domicile | Bologna | via falsa 40
6 | Giulio | 2016 | residence | Bologna | via torna 10
7 | Giulio | 2016 | domicile | Bologna | via torna 10
The important thing to remember about this kind of data is that it might not be the ideal format for the analysis you have in mind.
The tidy format is brilliant, especially for long-term storage and for keeping the metadata about the measurements, but some analyses might need the data to be transformed into non-tidy formats (for example, to evaluate differences between time points).
For this reason, all the libraries that manage dataframes have a strong focus on data transformation, reshaping the data from one form to another. This makes it easy to obtain the best structure for the analysis we want to perform without sacrificing data quality in long-term storage.
pandas is the main Python library for manipulating dataframe structures.
The library introduces the DataFrame class, which holds a single table composed of several Series (one per column), all sharing the same index.
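A minimal sketch of this idea (the names and values here are invented for illustration):

import pandas as pd

# two Series sharing the same index (the people's names)
age = pd.Series([24, 33, 30], index=['Andrea', 'Maria', 'Luca'])
height = pd.Series([178, 154, 175], index=['Andrea', 'Maria', 'Luca'])

# the DataFrame aligns the Series on their common index
people = pd.DataFrame({'age': age, 'height': height})
people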
One of the strengths of pandas is the ability to read and write almost any tabular format.
For example, we can download and parse the tables of a Wikipedia page with a single command.
We will discuss:
import pandas as pd
# the lxml library needs to be installed for read_html to parse the page
import numpy as np
page = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'
wikitables = pd.read_html(page)
len(wikitables)
92
The page contains several tables, and we obtain a list with all of them.
The one we are interested in is the first one: the actual table of the highest-grossing movies.
wikitables[0].head()
Rank | Peak | Title | Worldwide gross | Year | Reference(s) | |
---|---|---|---|---|---|---|
0 | 1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 | [# 1][# 2] |
1 | 2 | 1 | Avatar | $2,790,439,000 | 2009 | [# 3][# 4] |
2 | 3 | 1 | Titanic | $2,194,439,542 | 1997 | [# 5][# 6] |
3 | 4 | 3 | Star Wars: The Force Awakens | $2,068,223,624 | 2015 | [# 7][# 8] |
4 | 5 | 4 | Avengers: Infinity War | $2,048,359,754 | 2018 | [# 9][# 10] |
Reading functions accept tens of parameters, allowing us to read the data exactly as we need it.
wikitables = pd.read_html(
page,
attrs={"class":"wikitable sortable plainrowheaders"},
index_col='Rank',
)
wikitables[0].head()
Rank | Peak | Title | Worldwide gross | Year | Reference(s)
---|---|---|---|---|---
1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 | [# 1][# 2] |
2 | 1 | Avatar | $2,790,439,000 | 2009 | [# 3][# 4] |
3 | 1 | Titanic | $2,194,439,542 | 1997 | [# 5][# 6] |
4 | 3 | Star Wars: The Force Awakens | $2,068,223,624 | 2015 | [# 7][# 8] |
5 | 4 | Avengers: Infinity War | $2,048,359,754 | 2018 | [# 9][# 10] |
wikitables = pd.read_html(page)
dataframe = wikitables[0].copy()
A dataframe behaves similarly to a dictionary: the column names are the keys, and the values are the Series associated with those names.
dataframe.columns
Index(['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year', 'Reference(s)'], dtype='object')
dataframe['Year'].head()
0    2019
1    2009
2    1997
3    2015
4    2018
Name: Year, dtype: int64
Series behave similarly to numpy arrays with regard to vectorization, but they combine values based on the index, not on the order of the elements.
dataframe['Year'].head() * 100
0    201900
1    200900
2    199700
3    201500
4    201800
Name: Year, dtype: int64
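Following the dictionary analogy, a new column can be created simply by assigning to a new key. A small sketch (the Decade column and the copy are just for illustration, so the dataframe used in the rest of the examples stays untouched):

# assigning to a missing key creates a new column, as with a dict
df_copy = dataframe.copy()
df_copy['Decade'] = (df_copy['Year'] // 10) * 10
df_copy[['Year', 'Decade']].head()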
a = pd.Series(
[1, 2, 3],
index=['a', 'b', 'c'],
)
a
a    1
b    2
c    3
dtype: int64
b = pd.Series(
[5, 6, 7],
index=['c', 'a', 'b'],
)
b
c    5
a    6
b    7
dtype: int64
a+b
a    7
b    9
c    8
dtype: int64
Series and DataFrames also inherit the slicing properties of numpy arrays.
a = pd.Series(
[1, 2, 3],
index=['a', 'b', 'c'],
)
a>1
a    False
b     True
c     True
dtype: bool
a[a>1]
b    2
c    3
dtype: int64
a = pd.Series(
[1, 2, 3, 4, 5, 6],
index=['a', 'b', 'c', 'd', 'e', 'f'],
)
idx_1 = a>2
idx_2 = a<5
a[idx_1 & idx_2]
c    3
d    4
dtype: int64
They can also be sliced on the index, both by single keys and by ranges of keys.
Ranges of keys refer to the order of the index, not to the natural sorting of the values contained in it.
NOTE: label-based slicing includes the last value of the slice as well!
a['a']
1
a['a':'b']
a    1
b    2
dtype: int64
b = pd.Series([5, 6, 7], index=['c', 'a', 'b'])
b['c':'a']
c    5
a    6
dtype: int64
We can manipulate columns in various ways, starting with the removal of unwanted ones.
del dataframe['Reference(s)']
dataframe.head()
Rank | Peak | Title | Worldwide gross | Year | |
---|---|---|---|---|---|
0 | 1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 |
1 | 2 | 1 | Avatar | $2,790,439,000 | 2009 |
2 | 3 | 1 | Titanic | $2,194,439,542 | 1997 |
3 | 4 | 3 | Star Wars: The Force Awakens | $2,068,223,624 | 2015 |
4 | 5 | 4 | Avengers: Infinity War | $2,048,359,754 | 2018 |
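An alternative to del (a small sketch, with an arbitrary variable name) is the drop method, which returns a new dataframe and leaves the original untouched:

# drop returns a copy without the listed columns
smaller = dataframe.drop(columns=['Peak'])
smaller.head()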
dataframe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Rank             50 non-null     int64
 1   Peak             50 non-null     object
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     object
 4   Year             50 non-null     int64
dtypes: int64(2), object(3)
memory usage: 1.4+ KB
dataframe.select_dtypes(include='int64').head()
Rank | Year | |
---|---|---|
0 | 1 | 2019 |
1 | 2 | 2009 |
2 | 3 | 1997 |
3 | 4 | 2015 |
4 | 5 | 2018 |
dataframe.select_dtypes(include=np.number).head()
Rank | Year | |
---|---|---|
0 | 1 | 2019 |
1 | 2 | 2009 |
2 | 3 | 1997 |
3 | 4 | 2015 |
4 | 5 | 2018 |
dataframe['Title'].head()
0               Avengers: Endgame
1                          Avatar
2                         Titanic
3    Star Wars: The Force Awakens
4          Avengers: Infinity War
Name: Title, dtype: object
dataframe.sort_values('Year').head()
Rank | Peak | Title | Worldwide gross | Year | |
---|---|---|---|---|---|
39 | 40 | 1 | Jurassic Park | $1,029,939,903 | 1993 |
48 | 49 | 2 | The Lion King | $968,483,777 | 1994 |
2 | 3 | 1 | Titanic | $2,194,439,542 | 1997 |
41 | 42 | 2 | Star Wars: Episode I – The Phantom Menace | $1,027,044,677 | 1999 |
46 | 47 | 2PS | Harry Potter and the Philosopher's Stone | $975,051,288 | 2001 |
dataframe.sort_values('Year', ascending=False).head()
Rank | Peak | Title | Worldwide gross | Year | |
---|---|---|---|---|---|
0 | 1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 |
9 | 10 | 10 | Frozen II | $1,450,026,933 | 2019 |
36 | 37 | 34 | Aladdin | $1,050,693,953 | 2019 |
32 | 33 | 30 | Toy Story 4 | $1,073,394,593 | 2019 |
31 | 32 | 32 | Star Wars: The Rise of Skywalker | $1,074,144,248 | 2019 |
dataframe.query("Year>=2017")
Rank | Peak | Title | Worldwide gross | Year | |
---|---|---|---|---|---|
0 | 1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 |
4 | 5 | 4 | Avengers: Infinity War | $2,048,359,754 | 2018 |
6 | 7 | 7 | The Lion King | $1,656,943,394 | 2019 |
9 | 10 | 10 | Frozen II | $1,450,026,933 | 2019 |
11 | 12 | 9 | Black Panther | $1,346,913,161 | 2018 |
13 | 14 | 9 | Star Wars: The Last Jedi | $1,332,539,889 | 2017 |
14 | 15 | 12 | Jurassic World: Fallen Kingdom | $1,309,484,461 | 2018 |
16 | 17 | 10 | Beauty and the Beast | $1,263,521,126 | 2017 |
17 | 18 | 15 | Incredibles 2 | $1,242,805,359 | 2018 |
18 | 19 | 11 | The Fate of the Furious | F8$1,238,764,765 | 2017 |
22 | 23 | 20 | Aquaman | $1,148,161,807 | 2018 |
24 | 25 | 24 | Spider-Man: Far From Home | $1,131,927,996 | 2019 |
25 | 26 | 23 | Captain Marvel | $1,128,274,794 | 2019 |
30 | 31 | 31 | Joker | $1,074,251,311 | 2019 |
31 | 32 | 32 | Star Wars: The Rise of Skywalker | $1,074,144,248 | 2019 |
32 | 33 | 30 | Toy Story 4 | $1,073,394,593 | 2019 |
36 | 37 | 34 | Aladdin | $1,050,693,953 | 2019 |
38 | 39 | 24 | Despicable Me 3 | $1,034,799,409 | 2017 |
selection = dataframe['Year']>=2017
selection.head()
0     True
1    False
2    False
3    False
4     True
Name: Year, dtype: bool
dataframe[selection]
Rank | Peak | Title | Worldwide gross | Year | |
---|---|---|---|---|---|
0 | 1 | 1 | Avengers: Endgame | $2,797,800,564 | 2019 |
4 | 5 | 4 | Avengers: Infinity War | $2,048,359,754 | 2018 |
6 | 7 | 7 | The Lion King | $1,656,943,394 | 2019 |
9 | 10 | 10 | Frozen II | $1,450,026,933 | 2019 |
11 | 12 | 9 | Black Panther | $1,346,913,161 | 2018 |
13 | 14 | 9 | Star Wars: The Last Jedi | $1,332,539,889 | 2017 |
14 | 15 | 12 | Jurassic World: Fallen Kingdom | $1,309,484,461 | 2018 |
16 | 17 | 10 | Beauty and the Beast | $1,263,521,126 | 2017 |
17 | 18 | 15 | Incredibles 2 | $1,242,805,359 | 2018 |
18 | 19 | 11 | The Fate of the Furious | F8$1,238,764,765 | 2017 |
22 | 23 | 20 | Aquaman | $1,148,161,807 | 2018 |
24 | 25 | 24 | Spider-Man: Far From Home | $1,131,927,996 | 2019 |
25 | 26 | 23 | Captain Marvel | $1,128,274,794 | 2019 |
30 | 31 | 31 | Joker | $1,074,251,311 | 2019 |
31 | 32 | 32 | Star Wars: The Rise of Skywalker | $1,074,144,248 | 2019 |
32 | 33 | 30 | Toy Story 4 | $1,073,394,593 | 2019 |
36 | 37 | 34 | Aladdin | $1,050,693,953 | 2019 |
38 | 39 | 24 | Despicable Me 3 | $1,034,799,409 | 2017 |
Often real data arrives in a format that can only be described as "far from optimal".
The first (and major) part of data analysis is cleaning and preprocessing the data into a decent form.
In our case, for example, we need to adjust the gross earnings of each movie, since we would like to treat them as numbers and not as strings.
c = 'Worldwide gross'
dataframe[c].str.split('$').head()
0    [, 2,797,800,564]
1    [, 2,790,439,000]
2    [, 2,194,439,542]
3    [, 2,068,223,624]
4    [, 2,048,359,754]
Name: Worldwide gross, dtype: object
dataframe[c].str.split('$').str[-1].head()
0    2,797,800,564
1    2,790,439,000
2    2,194,439,542
3    2,068,223,624
4    2,048,359,754
Name: Worldwide gross, dtype: object
dataframe[c] = dataframe[c].str.split('$').str[-1]
dataframe[c] = dataframe[c].str.replace(',', '')
dataframe[c] = dataframe[c].astype(np.int64)
dataframe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Rank             50 non-null     int64
 1   Peak             50 non-null     object
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     int64
 4   Year             50 non-null     int64
dtypes: int64(3), object(2)
memory usage: 1.6+ KB
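The same three steps could also be condensed into a single method chain; a small self-contained sketch on synthetic values (the raw variable is just an example):

raw = pd.Series(['$2,797,800,564', '$2,790,439,000'])
cleaned = (
    raw
    .str.split('$').str[-1]   # keep what follows the dollar sign
    .str.replace(',', '')     # remove the thousands separators
    .astype(np.int64)         # convert the remaining digits to integers
)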
Sometimes, one needs to use some violence...
try:
    dataframe['Peak'].astype(int)
except ValueError as e:
    print(e)
invalid literal for int() with base 10: '4TS3'
print(list(dataframe['Peak'].unique()))
['1', '3', '4', '7', '10', '5', '9', '12', '15', '11', '20', '2', '24', '23', '31', '32', '30', '4TS3', '34', '6', '22', '14', '2PS', '19DM2']
Given that there is no simple transformation, we need to use regular expressions.
Regular expressions are a way to describe the structure of a text and to specify which parts to extract from it.
import re
regex = re.compile(r'(\d+)\D*\d*')
regex = re.compile(
    r'(\d+)'  # extract this group, composed of one or more digits
    r'\D*'    # can be followed by 0 or more non-digits
    r'\d*'    # can be followed by 0 or more digits
)
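As a quick sanity check (just a sketch), we can test the pattern on one of the problematic values seen above:

# the first group captures only the leading digits of a messy value
regex.match('4TS3').group(1)
# returns '4'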
dataframe['Peak'] = dataframe['Peak'].str.extract(regex, expand=False)
dataframe['Peak'] = dataframe['Peak'].astype(int)
dataframe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Rank             50 non-null     int64
 1   Peak             50 non-null     int32
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     int64
 4   Year             50 non-null     int64
dtypes: int32(1), int64(3), object(1)
memory usage: 1.6+ KB
dataframe['Rank'] = dataframe['Rank'].astype(np.int64)
dataframe.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Rank             50 non-null     int64
 1   Peak             50 non-null     int32
 2   Title            50 non-null     object
 3   Worldwide gross  50 non-null     int64
 4   Year             50 non-null     int64
dtypes: int32(1), int64(3), object(1)
memory usage: 1.6+ KB
dataframe.describe()
Rank | Peak | Worldwide gross | Year | |
---|---|---|---|---|
count | 50.00000 | 50.000000 | 5.000000e+01 | 50.000000 |
mean | 25.50000 | 10.980000 | 1.291294e+09 | 2012.960000 |
std | 14.57738 | 9.725812 | 4.193347e+08 | 6.639277 |
min | 1.00000 | 1.000000 | 9.665506e+08 | 1993.000000 |
25% | 13.25000 | 3.250000 | 1.046959e+09 | 2011.000000 |
50% | 25.50000 | 7.000000 | 1.130101e+09 | 2015.000000 |
75% | 37.75000 | 18.000000 | 1.339405e+09 | 2018.000000 |
max | 50.00000 | 34.000000 | 2.797801e+09 | 2019.000000 |
We can plot the data with both matplotlib and pandas:
both offer convenient ways to display the data contained in a dataframe.
Later on we will see a more appropriate library, seaborn, but for now we will use a quick & dirty approach.
import matplotlib.pyplot as plt
plt.scatter('Rank', 'Peak', data=dataframe)
<matplotlib.collections.PathCollection at 0x1925370>
dataframe.plot.scatter('Rank', 'Peak')
<matplotlib.axes._subplots.AxesSubplot at 0x19af510>
plt.plot('Rank', 'Worldwide gross', data=dataframe)
[<matplotlib.lines.Line2D at 0x4b9b710>]
with plt.xkcd():
    dataframe.plot('Rank', 'Worldwide gross')
plt.hist('Worldwide gross', data=dataframe);
dataframe['Worldwide gross'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x4cbb9d0>
pandas is actually just creating a matplotlib plot under the hood, so you can extract it and modify it like any other plot.
dataframe['Worldwide gross'].plot.hist()
ax = plt.gca()
fig = plt.gcf()
ax.set_title("Histogram of Worldwide gross", fontsize=25, y=1.05)
fig.tight_layout()
Pandas dataframes (and the dataframe structure in general) are designed to represent data stored in table form, with columns and rows containing scalar data such as height, weight, age and so on.
For more complicated data, where different kinds of data structures need to be related to each other while keeping most of the goodies that pandas provides, an alternative is xarray.
This library includes the equivalents of the pandas Series and DataFrame (DataArray and Dataset, respectively), which provide an efficient and convenient way to manipulate multidimensional arrays, where several of them share one or more of their indexes.
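A minimal sketch of what this looks like, assuming xarray is installed (the dimension names, coordinates and values are invented for illustration):

import numpy as np
import xarray as xr

# a 2D labelled array: the dimensions have names and coordinate labels
temperature = xr.DataArray(
    np.random.rand(3, 4),
    dims=('city', 'month'),
    coords={'city': ['Roma', 'Milano', 'Napoli'], 'month': [1, 2, 3, 4]},
)
# label-based selection, analogous to a pandas index
temperature.sel(city='Roma')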
We will discuss some common operations on dataframes: grouping, joining and pivoting.
Grouping is a whole family of operations that can be summarized as:
1. split the dataframe into groups according to some key;
2. apply a function to each group independently;
3. combine the results into a new structure.
The result of step 3 is going to be one value per group in the case of aggregation operations (sum, average, etc.) or a new version of the dataframe with the transformed values.
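A minimal sketch of the pattern on a toy table (the names and numbers are invented):

toy = pd.DataFrame({
    'city': ['Roma', 'Roma', 'Milano', 'Milano'],
    'value': [1, 2, 3, 5],
})
# split by city, apply the mean to each group, combine into a new Series
toy.groupby('city')['value'].mean()
# city
# Milano    4.0
# Roma      1.5
# Name: value, dtype: float64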
wiki = "https://it.wikipedia.org/wiki/"
url_popolazione = wiki + "Comuni_d%27Italia_per_popolazione"
url_superficie = wiki + "Primi_100_comuni_italiani_per_superficie"
comuni_popolazione = pd.read_html(url_popolazione, header=0)
comuni_popolazione = comuni_popolazione[0]
comuni_popolazione.head()
N° | Comune | Regione | Provincia / Città metropolitana | Abitanti | |
---|---|---|---|---|---|
0 | 1 | Roma | Lazio | Roma | 2 847 490 |
1 | 2 | Milano | Lombardia | Milano | 1 388 223 |
2 | 3 | Napoli | Campania | Napoli | 955 503 |
3 | 4 | Torino | Piemonte | Torino | 875 063 |
4 | 5 | Palermo | Sicilia | Palermo | 659 052 |
comuni_superficie = pd.read_html(url_superficie, header=0)
# the page layout might change; right now table 0 is an info box
comuni_superficie = comuni_superficie[1]
comuni_superficie.head()
Pos. | Comune | Regione | Provincia | Superficie (km²) | |
---|---|---|---|---|---|
0 | 1 | Roma | Lazio | Roma | 128736 |
1 | 2 | Ravenna | Emilia-Romagna | Ravenna | 65382 |
2 | 3 | Cerignola | Puglia | Foggia | 59393 |
3 | 4 | Noto | Sicilia | Siracusa | 55499 |
4 | 5 | Sassari | Sardegna | Sassari | 54704 |
comuni_superficie.groupby('Regione').mean()
Regione | Pos. | Superficie (km²)
---|---|---
Abruzzo | 9.000000 | 47391.000000 |
Basilicata | 54.666667 | 29641.000000 |
Calabria | 52.333333 | 28937.666667 |
Emilia-Romagna | 56.200000 | 31154.100000 |
Lazio | 29.250000 | 56263.750000 |
Liguria | 80.000000 | 24029.000000 |
Lombardia | 94.500000 | 22701.500000 |
Marche | 84.333333 | 24168.000000 |
Puglia | 46.210526 | 33034.263158 |
Sardegna | 62.625000 | 29839.375000 |
Sicilia | 46.300000 | 32300.150000 |
Toscana | 52.142857 | 29941.357143 |
Trentino-Alto Adige | 56.000000 | 27485.000000 |
Umbria | 32.142857 | 36175.285714 |
Veneto | 49.000000 | 30853.000000 |
g = comuni_superficie.groupby('Regione')
g.aggregate([np.mean, np.std, pd.Series.count])
Pos. | Superficie (km²) | |||||
---|---|---|---|---|---|---|
mean | std | count | mean | std | count | |
Regione | ||||||
Abruzzo | 9.000000 | NaN | 1 | 47391.000000 | NaN | 1 |
Basilicata | 54.666667 | 33.321665 | 3 | 29641.000000 | 8419.030110 | 3 |
Calabria | 52.333333 | 26.407070 | 3 | 28937.666667 | 5408.599850 | 3 |
Emilia-Romagna | 56.200000 | 30.828558 | 10 | 31154.100000 | 13146.217719 | 10 |
Lazio | 29.250000 | 24.743686 | 4 | 56263.750000 | 48688.754926 | 4 |
Liguria | 80.000000 | NaN | 1 | 24029.000000 | NaN | 1 |
Lombardia | 94.500000 | 2.121320 | 2 | 22701.500000 | 40.305087 | 2 |
Marche | 84.333333 | 24.542480 | 3 | 24168.000000 | 2632.717987 | 3 |
Puglia | 46.210526 | 29.628400 | 19 | 33034.263158 | 10048.492396 | 19 |
Sardegna | 62.625000 | 32.762947 | 8 | 29839.375000 | 11275.484278 | 8 |
Sicilia | 46.300000 | 27.380554 | 20 | 32300.150000 | 9656.570331 | 20 |
Toscana | 52.142857 | 26.590505 | 14 | 29941.357143 | 7114.648125 | 14 |
Trentino-Alto Adige | 56.000000 | 24.041631 | 2 | 27485.000000 | 3877.773588 | 2 |
Umbria | 32.142857 | 20.860078 | 7 | 36175.285714 | 9897.267396 | 7 |
Veneto | 49.000000 | 29.461840 | 3 | 30853.000000 | 9300.741315 | 3 |
comuni_superficie.groupby('Regione')['Superficie (km²)'].count()
Regione Abruzzo 1 Basilicata 3 Calabria 3 Emilia-Romagna 10 Lazio 4 Liguria 1 Lombardia 2 Marche 3 Puglia 19 Sardegna 8 Sicilia 20 Toscana 14 Trentino-Alto Adige 2 Umbria 7 Veneto 3 Name: Superficie (km²), dtype: int64
g = comuni_superficie.groupby('Regione')['Superficie (km²)']
g.count().sort_values(ascending=False)
Regione Sicilia 20 Puglia 19 Toscana 14 Emilia-Romagna 10 Sardegna 8 Umbria 7 Lazio 4 Veneto 3 Marche 3 Calabria 3 Basilicata 3 Trentino-Alto Adige 2 Lombardia 2 Liguria 1 Abruzzo 1 Name: Superficie (km²), dtype: int64
g = comuni_popolazione.groupby('Regione')['Abitanti']
g.count().sort_values(ascending=False)
Regione Campania 19 Sicilia 16 Lombardia 15 Puglia 15 Toscana 13 Emilia-Romagna 13 Lazio 11 Calabria 6 Veneto 6 Piemonte 6 Abruzzo 5 Liguria 4 Sardegna 4 Umbria 3 Marche 3 Friuli-Venezia Giulia 3 Trentino-Alto Adige 2 Basilicata 2 Name: Abitanti, dtype: int64
When two separate tables share (at least partially) their index, we can join them.
In this case, for example, we can try to obtain the surface area and the population of each town.
In general this can be used to keep the data tables in a clean and tidy format for storage, and then merge the information we need for each analysis.
For example, in a tidy dataset with several measurements per subject, it would not be a good idea to store the subject's age repeatedly (it could lead to inconsistencies); it is better to keep it in a separate data table and join the two when needed.
There are several ways to combine the two tables, expressed by the keyword how. These are:
In the left join the left dataframe is kept completely, and the rows of the right dataframe are selected based on the keys of the left one. The right join is exactly the opposite: the keys of the right dataframe are used to select rows from the left one. (Note that pd.merge actually defaults to an inner join.)
In the inner join only the elements whose keys appear in both dataframes are kept (it is akin to an intersection of the two).
In the outer join all the elements are kept, with all the possible combinations repeated (it is akin to a union, taking the cartesian product of the matching rows).
a = pd.DataFrame(
[
('Antonio', 'M'),
('Marco', 'M'),
('Francesca', 'F'),
('Giulia', 'F'),
],
columns = ['name', 'sex'])
b = pd.DataFrame(
[
('Antonio', 15),
('Marco', 10),
('Marco', 12),
('Carlo', 23),
('Francesca', 20),
],
columns = ['name', 'expenses'])
a
name | sex | |
---|---|---|
0 | Antonio | M |
1 | Marco | M |
2 | Francesca | F |
3 | Giulia | F |
b
name | expenses | |
---|---|---|
0 | Antonio | 15 |
1 | Marco | 10 |
2 | Marco | 12 |
3 | Carlo | 23 |
4 | Francesca | 20 |
pd.merge(a, b, on='name', how='left')
name | sex | expenses | |
---|---|---|---|
0 | Antonio | M | 15.0 |
1 | Marco | M | 10.0 |
2 | Marco | M | 12.0 |
3 | Francesca | F | 20.0 |
4 | Giulia | F | NaN |
We can validate the expected relationship between the two tables using the validate option.
In this case the relationship is one-to-many.
This can save us from surprises or errors in the source data!
pd.merge(a, b, on='name', how='left', validate="1:m")
name | sex | expenses | |
---|---|---|---|
0 | Antonio | M | 15.0 |
1 | Marco | M | 10.0 |
2 | Marco | M | 12.0 |
3 | Francesca | F | 20.0 |
4 | Giulia | F | NaN |
# if we validate with one to one it does fail
pd.merge(a, b, on='name', how='left', validate="1:1")
--------------------------------------------------------------------------- MergeError Traceback (most recent call last) <ipython-input-14-c1ff55754590> in <module> 1 # if we validate with one to one it does fail ----> 2 pd.merge(a, b, on='name', how='left', validate="1:1") ~/miniconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate) 72 validate=None, 73 ) -> "DataFrame": ---> 74 op = _MergeOperation( 75 left, 76 right, ~/miniconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate) 676 # are in fact unique. 677 if validate is not None: --> 678 self._validate(validate) 679 680 def get_result(self): ~/miniconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _validate(self, validate) 1364 ) 1365 elif not right_unique: -> 1366 raise MergeError( 1367 "Merge keys are not unique in right dataset; not a one-to-one merge" 1368 ) MergeError: Merge keys are not unique in right dataset; not a one-to-one merge
pd.merge(a, b, on='name', how='right')
name | sex | expenses | |
---|---|---|---|
0 | Antonio | M | 15 |
1 | Marco | M | 10 |
2 | Marco | M | 12 |
3 | Carlo | NaN | 23 |
4 | Francesca | F | 20 |
pd.merge(a, b, on='name', how='inner')
name | sex | expenses | |
---|---|---|---|
0 | Antonio | M | 15 |
1 | Marco | M | 10 |
2 | Marco | M | 12 |
3 | Francesca | F | 20 |
pd.merge(a, b, on='name', how='outer')
name | sex | expenses | |
---|---|---|---|
0 | Antonio | M | 15.0 |
1 | Marco | M | 10.0 |
2 | Marco | M | 12.0 |
3 | Francesca | F | 20.0 |
4 | Giulia | F | NaN |
5 | Carlo | NaN | 23.0 |
How do these joins look for our data?
_ = pd.merge(
comuni_popolazione,
comuni_superficie,
on=['Comune', 'Regione'],
)
_.head()
N° | Comune | Regione | Provincia / Città metropolitana | Abitanti | Pos. | Provincia | Superficie (km²) | |
---|---|---|---|---|---|---|---|---|
0 | 1 | Roma | Lazio | Roma | 2 847 490 | 1 | Roma | 128736 |
1 | 6 | Genova | Liguria | Genova | 575 577 | 80 | Genova | 24029 |
2 | 11 | Venezia | Veneto | Venezia | 259 736 | 15 | Venezia | 41590 |
3 | 17 | Parma | Emilia-Romagna | Parma | 197 478 | 62 | Parma | 26060 |
4 | 18 | Taranto | Puglia | Taranto | 195 279 | 71 | Taranto | 24986 |
len(pd.merge(
comuni_popolazione,
comuni_superficie,
on='Comune',
how='right',
))
100
len(pd.merge(
comuni_popolazione,
comuni_superficie,
on='Comune',
how='left',
))
146
len(pd.merge(
comuni_popolazione,
comuni_superficie,
on='Comune',
how='inner',
))
37
len(pd.merge(
comuni_popolazione,
comuni_superficie,
on='Comune',
how='outer',
))
209
Pivoting is a family of operations that allows us to create aggregated tables (including contingency tables) starting from a tidy dataset.
It takes a tidy dataset and puts it into a wide format (the inverse operation is usually referred to as melting).
To perform a pivot one chooses:
* the column whose values will become the index (the rows) of the new table;
* the column whose values will become the columns of the new table;
* the column that provides the values to fill the new table with.
If more than one value corresponds to a given pair of index and column, then one needs to specify an aggregation function to summarize those values, such as sum, average, count, standard deviation, etc.
spese = [
('Antonio', 'cat', 4),
('Antonio', 'cat', 5),
('Antonio', 'cat', 6),
('Giulia', 'cat', 3),
('Giulia', 'dog', 7),
('Giulia', 'dog', 8),
]
spese = pd.DataFrame(spese, columns = ['name', 'animal', 'expenses'])
spese
name | animal | expenses | |
---|---|---|---|
0 | Antonio | cat | 4 |
1 | Antonio | cat | 5 |
2 | Antonio | cat | 6 |
3 | Giulia | cat | 3 |
4 | Giulia | dog | 7 |
5 | Giulia | dog | 8 |
pd.pivot_table(
spese,
index='name',
columns='animal',
values='expenses',
aggfunc=np.sum,
)
animal | cat | dog |
---|---|---|
name | ||
Antonio | 15.0 | NaN |
Giulia | 3.0 | 15.0 |
pd.pivot_table(
spese,
index='name',
columns='animal',
values='expenses',
aggfunc=np.sum,
fill_value=0,
)
animal | cat | dog |
---|---|---|
name | ||
Antonio | 15 | 0 |
Giulia | 3 | 15 |
pd.pivot_table(
spese,
index='name',
columns='animal',
values='expenses',
aggfunc=np.sum,
fill_value=0,
margins=True,
)
animal | cat | dog | All |
---|---|---|---|
name | |||
Antonio | 15 | 0 | 15 |
Giulia | 3 | 15 | 18 |
All | 18 | 15 | 33 |
pd.pivot_table(
spese,
index='name',
columns='animal',
values='expenses',
aggfunc=pd.Series.count,
fill_value=0,
)
animal | cat | dog |
---|---|---|
name | ||
Antonio | 3 | 0 |
Giulia | 1 | 2 |
r = pd.pivot_table(
spese,
index='name',
columns='animal',
values='expenses',
aggfunc=pd.Series.count,
fill_value=0,
)
r = r.reset_index()
r
animal | name | cat | dog |
---|---|---|---|
0 | Antonio | 3 | 0 |
1 | Giulia | 1 | 2 |
Melting converts a wide table into a long format, basically turning the column names into new values to use for indexing.
v = pd.melt(r, id_vars=['name'], value_vars=['dog', 'cat'])
v
name | animal | value | |
---|---|---|---|
0 | Antonio | dog | 0 |
1 | Giulia | dog | 2 |
2 | Antonio | cat | 3 |
3 | Giulia | cat | 1 |
v2 = v.set_index(['name', 'animal'])['value']
v2
name     animal
Antonio  dog       0
Giulia   dog       2
Antonio  cat       3
Giulia   cat       1
Name: value, dtype: int64
v2.unstack()
animal | cat | dog |
---|---|---|
name | ||
Antonio | 3 | 0 |
Giulia | 1 | 2 |
v2.unstack().stack()
name     animal
Antonio  cat       3
         dog       0
Giulia   cat       1
         dog       2
dtype: int64
v.pivot(index='name', columns='animal', values='value')
animal | cat | dog |
---|---|---|
name | ||
Antonio | 3 | 0 |
Giulia | 1 | 2 |
v.pivot(index='name', columns='animal', values='value')
is identical to
v2.unstack()
but one acts on a Series with a hierarchical index (unstack) and the other on tidy DataFrames (pivot).
Note that unstack does not support aggregation: it assumes all index entries are unique.
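One more difference worth noting (a small sketch, reusing the spese table from above): pivot requires every pair of index and column values to be unique, while pivot_table aggregates the duplicates.

try:
    # Antonio/cat appears three times, so a plain pivot cannot reshape it
    spese.pivot(index='name', columns='animal', values='expenses')
except ValueError as e:
    print(e)
# Index contains duplicate entries, cannot reshape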
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(url)
iris.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object
dtypes: float64(4), object(1)
memory usage: 5.3+ KB
import seaborn
By default seaborn changes the standard matplotlib configuration for visualization.
You can set whichever style you like using matplotlib's style module.
from matplotlib import style
print(sorted(style.available))
style.use('default')
['Solarize_Light2', '_classic_test', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
seaborn.lmplot(
x='sepal_length',
y='sepal_width',
data=iris,
hue='species',
)
<seaborn.axisgrid.FacetGrid at 0xe956bd0>
seaborn.lmplot(
x='sepal_length',
y='sepal_width',
data=iris,
col='species',
)
<seaborn.axisgrid.FacetGrid at 0xe97e470>
seaborn.lmplot(
x='sepal_length',
y='sepal_width',
data=iris,
row='species',
)
<seaborn.axisgrid.FacetGrid at 0xeaa34b0>
fg = seaborn.FacetGrid(
data=iris,
col='species',
hue='species',
height=4,
aspect=0.9,
)
fg.map(
plt.scatter,
'sepal_length',
'sepal_width',
s=50,
)
fg.map(
seaborn.regplot,
'sepal_length',
'sepal_width',
scatter=False,
)
fg.add_legend();
seaborn.jointplot(
x='sepal_length',
y='sepal_width',
data=iris,
kind="kde",
);
seaborn.boxplot(
x='species',
y='sepal_length',
data=iris,
)
<matplotlib.axes._subplots.AxesSubplot at 0x10be14d0>
seaborn.pairplot(iris, hue="species")
<seaborn.axisgrid.PairGrid at 0x10c30e70>
g = seaborn.PairGrid(
iris,
height=2,
hue='species',
)
g.map_diag(plt.hist, alpha=0.5)
g.map_offdiag(plt.scatter, alpha=0.75, s=20);
pandas provides various options to export dataframes, which we can divide into two groups: formats meant for further analysis and formats meant for reports.
Many functions are provided to export to (and read from) different formats that can then be used to load the data and perform additional analysis:
to_clipboard
to_csv
to_excel
to_feather
to_hdf
to_json
to_parquet
to_sql
to_stata
to_xml
There are also many other functions to convert dataframes into other data structures, such as numpy arrays, dictionaries, xarray objects, etc.
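For example (a small sketch with a throwaway dataframe):

small = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
small.to_dict(orient='records')   # [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]
small.to_numpy()                  # array([[1, 3], [2, 4]])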
For data storage an interesting approach (superior to simple csv or tsv) is to export to the JSON Lines format (we'll see more about it in the future), by using the lines=True parameter in both to_json and read_json.
data = [{'label': 'DRUG', 'pattern': 'aspirin'},
{'label': 'DRUG', 'pattern': 'trazodone'},
{'label': 'DRUG', 'pattern': 'citalopram'}]
df = pd.DataFrame(data)
print(df)
    label     pattern
0    DRUG     aspirin
1    DRUG   trazodone
2    DRUG  citalopram
# Output in JSONL format
jsonlines = df.to_json(orient='records', lines=True)
print(jsonlines)
{"label":"DRUG","pattern":"aspirin"} {"label":"DRUG","pattern":"trazodone"} {"label":"DRUG","pattern":"citalopram"}
df_clone = pd.read_json(jsonlines, lines=True)
print(df_clone)
    label     pattern
0    DRUG     aspirin
1    DRUG   trazodone
2    DRUG  citalopram
Pandas also provides functions to export dataframes in a way compatible with reports, such as:
.style.to_html
.style.to_latex
to_markdown
data = [{'label': 'DRUG', 'pattern': 'aspirin'},
{'label': 'DRUG', 'pattern': 'trazodone'},
{'label': 'DRUG', 'pattern': 'citalopram'}]
df = pd.DataFrame(data)
print(df.style.to_latex())
\begin{tabular}{lll} & label & pattern \\ 0 & DRUG & aspirin \\ 1 & DRUG & trazodone \\ 2 & DRUG & citalopram \\ \end{tabular}
print(df.style.to_html())
<style type="text/css"> </style> <table id="T_fdcf4"> <thead> <tr> <th class="blank level0" > </th> <th id="T_fdcf4_level0_col0" class="col_heading level0 col0" >label</th> <th id="T_fdcf4_level0_col1" class="col_heading level0 col1" >pattern</th> </tr> </thead> <tbody> <tr> <th id="T_fdcf4_level0_row0" class="row_heading level0 row0" >0</th> <td id="T_fdcf4_row0_col0" class="data row0 col0" >DRUG</td> <td id="T_fdcf4_row0_col1" class="data row0 col1" >aspirin</td> </tr> <tr> <th id="T_fdcf4_level0_row1" class="row_heading level0 row1" >1</th> <td id="T_fdcf4_row1_col0" class="data row1 col0" >DRUG</td> <td id="T_fdcf4_row1_col1" class="data row1 col1" >trazodone</td> </tr> <tr> <th id="T_fdcf4_level0_row2" class="row_heading level0 row2" >2</th> <td id="T_fdcf4_row2_col0" class="data row2 col0" >DRUG</td> <td id="T_fdcf4_row2_col1" class="data row2 col1" >citalopram</td> </tr> </tbody> </table>
pandas style provides a (relatively) simple way to format the table representation, to make it more informative and visually pleasant.
expenses = [
('antonio', 'cat', 4),
('antonio', 'cat', 5),
('antonio', 'cat', 6),
('giulia', 'cat', 3),
('giulia', 'dog', 7),
('giulia', 'dog', 8),
]
expenses = pd.DataFrame(expenses, columns = ['name', 'animal', 'expenses'])
expenses
name | animal | expenses | |
---|---|---|---|
0 | antonio | cat | 4 |
1 | antonio | cat | 5 |
2 | antonio | cat | 6 |
3 | giulia | cat | 3 |
4 | giulia | dog | 7 |
5 | giulia | dog | 8 |
# can format the content with format minilanguage and functions
expenses.style.format({"expenses": "{:.2f} €", "name": str.title})
name | animal | expenses | |
---|---|---|---|
0 | Antonio | cat | 4.00 € |
1 | Antonio | cat | 5.00 € |
2 | Antonio | cat | 6.00 € |
3 | Giulia | cat | 3.00 € |
4 | Giulia | dog | 7.00 € |
5 | Giulia | dog | 8.00 € |
# can apply generic functions (there is a wide list)
expenses.style.highlight_max(color='coral', subset=["expenses"])
name | animal | expenses | |
---|---|---|---|
0 | antonio | cat | 4 |
1 | antonio | cat | 5 |
2 | antonio | cat | 6 |
3 | giulia | cat | 3 |
4 | giulia | dog | 7 |
5 | giulia | dog | 8 |
#can apply a generic function cell-wise, column-wise, etc...
def highlight_median(x, color):
    import numpy as np
    median = np.median(x.to_numpy())
    style_to_apply = f"color: {color};"
    return np.where(x > median, style_to_apply, None)
expenses.style.apply(highlight_median, color='orange', subset=["expenses"])
name | animal | expenses | |
---|---|---|---|
0 | antonio | cat | 4 |
1 | antonio | cat | 5 |
2 | antonio | cat | 6 |
3 | giulia | cat | 3 |
4 | giulia | dog | 7 |
5 | giulia | dog | 8 |
# can export the styling to latex and html
latex_result = (
expenses.style
.format({"expenses": "{:.2f} €", "name": str.title})
.highlight_max(
subset=["expenses"],
props=(
'cellcolor:[HTML]{FFFF00}; '
'color:{red}; '
'bfseries: ; '
)
)
.set_caption("some info about pets")
.to_latex()
)
print(latex_result)
\begin{table} \caption{some info about pets} \begin{tabular}{lllr} & name & animal & expenses \\ 0 & Antonio & cat & 4.00 € \\ 1 & Antonio & cat & 5.00 € \\ 2 & Antonio & cat & 6.00 € \\ 3 & Giulia & cat & 3.00 € \\ 4 & Giulia & dog & 7.00 € \\ 5 & Giulia & dog & \cellcolor[HTML]{FFFF00} \color{red} \bfseries 8.00 € \\ \end{tabular} \end{table}
Download the list of the Nobel prize winners, and count the winners by category and country.
Then try to correlate this information with the per-capita beer consumption in each country, in particular for the physics prize.
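A possible starting point (only a sketch: which of the downloaded tables is the right one, and what its columns are called, depends on the page, so treat those details as assumptions to verify):

nobel_url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates'
nobel_tables = pd.read_html(nobel_url)
nobel = nobel_tables[0]   # inspect the list to find the correct table
nobel.head()              # check the column names before going further
# from here: reshape to one row per (year, category, laureate),
# then group by category and country and count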