df.dtypes # View the data types of columns
df.year.astype(float) # Convert column to new type
df.year = df.year.astype(float) # Assign converted type back to dataframe
pd.to_numeric(df.height,errors="coerce") # Convert to numeric, coercing unparseable values to NaN
df.year.min() # Call functions on series
df.agg(["min","max"]) # Aggregate several functions across all columns
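
A minimal sketch of the conversions above on a made-up dataframe (the column names and values are hypothetical):

import pandas as pd
df = pd.DataFrame({"year": ["1800", "1850", "1900"], "height": ["10", "n/a", "30"]})
df.year = df.year.astype(float)                        # year is now float64
df.height = pd.to_numeric(df.height, errors="coerce")  # "n/a" becomes NaN
print(df.dtypes)
print(df.agg(["min","max"]))                           # min and max of every column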

(height - height.mean()) / height.std() # Standardize around 0

new_values = (height - height.min()) / (height.max() - height.min()) # Normalize between 0 and 1

df["normalized_column"] = new_values # Assign normalized values back to dataframe as a new column

df.height.transform(lambda x: x/10) # Transform a single column

df.groupby("artist").transform("nunique") # View data summary by group

df.groupby("artist")["height"].transform("mean") # Transform a single column grouped by another column

df.filter(items=["id","artist"]) # View only certain columns

df.filter(regex="(?i)year") # View columns that match a regex

df.filter(axis=0, like="100") # Switch the axis to filter rows by index label
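
Roughly how the three filter styles behave on a toy dataframe (labels here are made up):

import pandas as pd
df = pd.DataFrame({"artist": ["Blake", "Turner"], "acquisitionYear": [1900, 1950]},
                  index=["id100", "id205"])
print(df.filter(items=["artist"]))       # keep only the listed columns
print(df.filter(regex="(?i)year"))       # columns whose names match the regex
print(df.filter(axis=0, like="100"))     # rows whose index label contains "100"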

df.drop("id",axis=1) # Drop a single column
df.drop(columns=["height","width"]) # Drop several columns
df.drop("id",axis=1, inplace=True) # Drop a column inplace
df = pd.read_csv("file.csv",usecols=["artist","title"]) # Only import certain columns

df.columns.str.lower() # Lowercase all column names
[x.lower() for x in df.columns] # Use a list comprehension
list(map(lambda x:x.lower(),df.columns)) # Use map
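
Whichever form you use, assign the result back to actually rename the columns; a small sketch with invented column names:

import pandas as pd
df = pd.DataFrame({"Artist": ["Blake"], "Title": ["The River"]})
df.columns = df.columns.str.lower()  # columns are now "artist" and "title"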

### Indexing and Filtering Datasets

df["id"] # Access a column as a Series
df["id"[1] # Access a single row on a column
df[1:5] # Access a range of rows with a slice
df[data["year"] > 1800] # Use a basic filter

df.loc[ROWS,COLS] # Basic format of .loc
df.loc[0:2,:] # Access a slice of rows and all columns
df.loc[0:2,["title","artist"]] # Access a slice of rows and specific columns
df.loc[df.artist == "Blake, Robert",:] # Filter on rows, and select all columns

df.iloc[ROWS,COLS] # Basic format of .iloc
df.iloc[0:2,:] # Slice of rows by integer position (the end is exclusive)
df.iloc[[1,5],[12,100]] # Access specific rows and specific columns by position
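
A small sketch of the label vs. integer-position distinction (index labels and values are invented):

import pandas as pd
df = pd.DataFrame({"title": ["A", "B", "C"], "artist": ["X", "Y", "Z"]}, index=[10, 20, 30])
print(df.loc[10:20, ["title"]])  # .loc slices by index label; the end label is included
print(df.iloc[0:2, [0]])         # .iloc slices by position; the end position is excluded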

df.col.str.contains("search") # Contains string method 
df.loc[df.col.str.contains("search"), ["artist","title"]] # Filter data with contains and loc and select certain columns
df.loc[df.col.str.contains("one|two", case=False, regex=True),:] # Case insensitive regex search
df.col.astype(str).str.contains("search",na=False) # Convert column type, and ignore NaN values
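
Putting those pieces together on a hypothetical title column:

import pandas as pd
df = pd.DataFrame({"title": ["The River Thames", None, "Two Figures"],
                   "artist": ["Blake", "Turner", "Blake"]})
mask = df.title.str.contains("river|two", case=False, regex=True, na=False)
print(df.loc[mask, ["artist","title"]])  # rows whose title matches either word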

## Handling Bad, Missing, and Duplicate Data
df.title.str.strip() # Strip whitespace from entire column
df.title.transform(lambda x:x.strip()) # Strip with a lambda function for greater flexibility

from numpy import nan 
df.replace({"colName":{"value":nan}}) # Import nan from numpy and replace all occurences of a specific value in a column
inplace = True # Remember to add inplace=True to change orginal data
df.loc[df.col=="value",["col"]] = nan # Filter with loc and fill with NaN
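
For example, replacing a placeholder string with a real missing value (the column and value are made up):

import pandas as pd
from numpy import nan
df = pd.DataFrame({"medium": ["Oil", "no data", "Watercolour"]})
df.replace({"medium": {"no data": nan}}, inplace=True)  # "no data" becomes NaN
print(df)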

df.fillna(-1) # Fill all NaN values in entire dataset
df.fillna(value={"col":0}) # Fill NaN values in a specific column
df.fillna(value={"col":0}, inplace=True) # Add inplace=True to change the original dataframe

df.dropna() # Drop rows with ANY NaN values
df.dropna(how="all") # Drop rows with ALL NaN Values
df.dropna(thresh=15) # Keep only rows with at least 15 non-NaN values; drop the rest
df.dropna(subset=["col1","col2"], inplace=True) # Only look at certain columns
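
A compact sketch of filling versus dropping, using toy data:

import pandas as pd
from numpy import nan
df = pd.DataFrame({"height": [10.0, nan, 30.0], "width": [nan, nan, 5.0]})
print(df.fillna(value={"height": -1}))  # only the height column is filled
print(df.dropna(subset=["width"]))      # drops rows where width is NaN
print(df.dropna(thresh=2))              # keeps rows with at least 2 non-NaN values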

df.drop_duplicates() # Drop all duplicates
df.drop_duplicates(subset=["col1"]) # Drop duplicates if they match across certain columns
df.drop_duplicates(keep=False) # Keep "first", "last" or False
df.loc[df.duplicated(subset=["col1","col2"], keep=False)] # Find and see duplicates using .loc across specific columns
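
For example, inspecting duplicates before dropping them (toy data):

import pandas as pd
df = pd.DataFrame({"artist": ["Blake", "Blake", "Turner"],
                   "title": ["The River", "The River", "Sunset"]})
print(df.loc[df.duplicated(subset=["artist","title"], keep=False)])  # shows both duplicated rows
print(df.drop_duplicates(subset=["artist","title"]))                 # keeps the first of each pair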