Python数据操作
1. 时间
1.1 日期转化
import time, datetime
import pandas as pd
# 今天
today = datetime.datetime.today().date()
# 时间格式化
date_point_formatted = date_point.strftime('%Y%m%d')
# 字符串转化日期
date = '20190101'
date = datetime.datetime.strptime(str(date), '%Y%m%d').date()
# 各种结构转化为日期:integer, float, string, datetime, list, tuple, 1-d array, Series
date = pd.to_datetime(str(date), format='%Y%m%d')
1.2 日期加减、比较
import datetime
today = datetime.datetime.today().date()
# 五年前
delta_year = 5
start_date = datetime.date(today.year - delta_year, today.month, today.day)
# 七天前
day_delta = datetime.timedelta(days=-7)
# 大小比较
today > start_date
2. 基本操作
2.1字符串
# 转为字符串
price = 12
price = str(12)
# 转为int
price = int(price)
# 分割
text = "600690.SH"
tmp = text.split(".")
print(tmp[1]+tmp[0])
# 大小写
text = text.lower()
text = text.upper()
2.2 列表
1. 遍历 元素
for item in list:
2. 遍历 索引, 元素
for index, item in enumerate(list):
2.3 HTTP请求设置代理
r = requests.post(url, params=para, headers=header, proxies={
'http': 'http://127.0.0.1:1086',
'https': 'https://127.0.0.1:1086'
})
3. DataFrame
3.1 设置index
date_index = pd.to_datetime(df['trade_date'], format='%Y%m%d')
df.set_index(date_index, inplace=True)
df = df.sort_index(ascending=True)
3.2 去重
# 去重
df = df.drop_duplicates(subset='trade_date', keep='first')
3.3 拼接
# 拼接
df = df.append(old_df, ignore_index=True)
# merge on
df = pd.merge(income_df, balancesheet_df, on='end_date')
# merge
merged_df = pd.merge(stock_price_df, finance_df, left_index=True, right_index=True, how='left')
3.4 遍历、删除
# 按index值进行遍历,并删除大于起始时间的
for index in df.index:
print(index)
if index.date() > start_date:
df.drop(index=index)
# 按下标进行遍历
for index in range(0, len(pe_df), 1):
line = pe_df.iloc[index]
# 直接遍历line
for index, line in df.iterrows():
3.5 获取column中最大、最小值
max_line = data.loc[data['profitRate'].idxmax()]
min_line = data.loc[data['profitRate'].idxmin()]
3.6 cell操作
# 字符串化
df['trade_date'] = df['trade_date'].apply(str)
# 日志操作
df['end_date'] = df['end_date'].apply(lambda x: x[2:6])
# 单元格操作
df['trade_date'] = df['trade_date'].apply(lambda x: ajust_date_to_quarter(x))
# 读取[row, column]的内容
df.at[row, column]
3.7 loc/iloc区别
# 下标方式获取df段
data = pe_df[index: index + 480]
# loc根据index获取,index_value: 真实值,比如20190101、20190102
line = pe_df.loc[index_value]
# iloc根据下标获取,index: 0、1、2、3
line = pe_df.iloc[index]
https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#indexing-selection
3.8 保存、加载
# 写文件
df.to_csv(file_path, index=False)
# 保持 unicode 编码
1. df.to_json('./external-resources/enemy-data.json'), orient = "records", force_ascii=False)
2. df.to_json(open('./external-resources/enemy-data.json', 'w', encoding='utf-8'), orient = "records", force_ascii=False)
# 读文件
df = pd.read_csv(dir_path+index_code)
# 第一列作为index
df = pd.read_csv(file_path, index_col=0)
3.9 日期操作
from pandas.tseries.offsets import *
today = '20190101'
today = pd.Timestamp(today)
# 本季度末
this_quarter = (today - QuarterEnd(n=0))
print(this_quarter.strftime("%Y%m%d"))
# 上一个季度末
last_quarter = (today - QuarterEnd(n=1))
print(last_quarter.strftime("%Y%m%d"))
3.9 其它
df = df[['total_revenue', 'total_profit', 'n_income']]
df = df.fillna(value=0)
df = df.round(2)
# 重命名
df = df.rename(columns={'n_income': '净利润', 'total_profit': '总利润'})
# 插入
finance_df.insert(0, '类目', finance_df.index)
# 所有column名称
finance_df.columns.values.tolist()
# group
df_max = df.groupby('trade_date').max().rename(columns={'close': 'max'})
# 转置
finance_df = finance_df.transpose()
# 是否存在某个row
result = row_name in analysed_stocks.index
4. 异常
4.1 异常栈
import traceback
try:
except Exception as e:
traceback.print_exc()