Python数据操作

1. 时间

1.1 日期转化

import time, datetime
import pandas as pd

# 今天
today = datetime.datetime.today().date()

# 时间格式化
date_point_formatted = date_point.strftime('%Y%m%d')

# 字符串转化日期
date = '20190101'
date = datetime.datetime.strptime(str(date), '%Y%m%d').date()

# 各种结构转化为日期:integer, float, string, datetime, list, tuple, 1-d array, Series
date = pd.to_datetime(str(date), format='%Y%m%d')

1.2 日期加减、比较

import datetime

today = datetime.datetime.today().date()

# 五年前
delta_year = 5
start_date = datetime.date(today.year - delta_year, today.month, today.day)

# 七天前
day_delta = datetime.timedelta(days=-7)

# 大小比较
today > start_date

2. 基本操作

2.1字符串

# 转为字符串
price = 12
price = str(12)

# 转为int
price = int(price)

# 分割
text = "600690.SH"
tmp = text.split(".")
print(tmp[1]+tmp[0])

# 大小写
text = text.lower()
text = text.upper()

 2.2 列表

 1. 遍历 元素
for item in list:
 
2. 遍历 索引, 元素
for index, item in enumerate(list):

2.3 HTTP请求设置代理

r = requests.post(url, params=para, headers=header, proxies={
    'http': 'http://127.0.0.1:1086',
    'https': 'https://127.0.0.1:1086'
})

3. DataFrame

3.1 设置index

date_index = pd.to_datetime(df['trade_date'], format='%Y%m%d')
df.set_index(date_index, inplace=True)
df = df.sort_index(ascending=True)

 3.2 去重

# 去重
df = df.drop_duplicates(subset='trade_date', keep='first')

 3.3 拼接

# 拼接
df = df.append(old_df, ignore_index=True)

# merge on
df = pd.merge(income_df, balancesheet_df, on='end_date')

# merge
merged_df = pd.merge(stock_price_df, finance_df, left_index=True, right_index=True, how='left')

 3.4 遍历、删除

# 按index值进行遍历,并删除大于起始时间的
for index in df.index:
  print(index)
  if index.date() > start_date:
    df.drop(index=index)

# 按下标进行遍历
for index in range(0, len(pe_df), 1):
  line = pe_df.iloc[index]

# 直接遍历line
for index, line in df.iterrows():

 3.5 获取column中最大、最小值

max_line = data.loc[data['profitRate'].idxmax()]
min_line = data.loc[data['profitRate'].idxmin()]

 3.6 cell操作

# 字符串化
df['trade_date'] = df['trade_date'].apply(str)

# 日志操作
df['end_date'] = df['end_date'].apply(lambda x: x[2:6])

# 单元格操作
df['trade_date'] = df['trade_date'].apply(lambda x: ajust_date_to_quarter(x))

# 读取[row, column]的内容
df.at[row, column]

 3.7 loc/iloc区别

# 下标方式获取df段
data = pe_df[index: index + 480]

# loc根据index获取,index_value: 真实值,比如20190101、20190102
line = pe_df.loc[index_value]

# iloc根据下标获取,index: 0、1、2、3
line = pe_df.iloc[index]

DataFrame Selection
https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#indexing-selection

 3.8 保存、加载

# 写文件
df.to_csv(file_path, index=False)

# 保持 unicode 编码
1. df.to_json('./external-resources/enemy-data.json'), orient = "records", force_ascii=False)
2. df.to_json(open('./external-resources/enemy-data.json', 'w', encoding='utf-8'), orient = "records", force_ascii=False)


# 读文件
df = pd.read_csv(dir_path+index_code)

# 第一列作为index
df = pd.read_csv(file_path, index_col=0)

3.9 日期操作

from pandas.tseries.offsets import *
today = '20190101'
today = pd.Timestamp(today)
# 本季度末
this_quarter = (today - QuarterEnd(n=0))
print(this_quarter.strftime("%Y%m%d"))
# 上一个季度末
last_quarter = (today - QuarterEnd(n=1))
print(last_quarter.strftime("%Y%m%d"))

3.9 其它

df = df[['total_revenue', 'total_profit', 'n_income']]
df = df.fillna(value=0)
df = df.round(2)
# 重命名
df = df.rename(columns={'n_income': '净利润', 'total_profit': '总利润'})

# 插入
finance_df.insert(0, '类目', finance_df.index)

# 所有column名称
finance_df.columns.values.tolist()

# group
df_max = df.groupby('trade_date').max().rename(columns={'close': 'max'})

# 转置
finance_df = finance_df.transpose()

# 是否存在某个row
result = row_name in analysed_stocks.index

4. 异常

4.1 异常栈

import traceback

try:
  
except Exception as e:
  traceback.print_exc()