#!/usr/bin/python
#coding=utf-8

探索OSM的文件格式(Node对象)。

在线获取OpenStreetMap区域地图数据,转为GeoPandas,并最终转为shp格式文件。

by [email protected]

OSM为xml格式,解析文件结构使用强大的requests数据下载包,网页和xml分析神器BeautifulSoup。

**注意:**由于BeautifulSoup将数据读到内存处理,因此不适合大数据量的处理。
from bs4 import BeautifulSoup as bs
import requests as req
from pprint import *

直接下载到内存。不推荐使用,因为如果网络中断,需要重新下载。

url = "http://api.openstreetmap.org/api/0.6/map?bbox=11.54,48.14,11.543,48.145"
try:
    r = req.get(url)
    print(r)
except Exception as ex:
    print("Error:",ex)

使用wget -c下载OSM数据,保存到本地文件,然后载入。

!wget -c -O osm_test.osm "http://api.openstreetmap.org/api/0.6/map?bbox=11.54,48.14,11.543,48.145"
--2016-05-04 14:59:47--  http://api.openstreetmap.org/api/0.6/map?bbox=11.54,48.14,11.543,48.145
正在解析主机 api.openstreetmap.org (api.openstreetmap.org)... 193.63.75.99, 193.63.75.100, 193.63.75.103, ...
正在连接 api.openstreetmap.org (api.openstreetmap.org)|193.63.75.99|:80... 已连接。
已发出 HTTP 请求,正在等待回应... 200 OK
长度: 未指定 [text/xml]
正在保存至: “osm_test.osm”

osm_test.osm            [        <=>         ]   2.67M  87.2KB/s    in 39s     

2016-05-04 15:00:27 (71.0 KB/s) - “osm_test.osm” 已保存 [2799533]

查看文件列表。可到当前目录去查看内容,由于文件较大,不要在本页面直接打开。

!ls -l -h
总用量 2.8M
-rw-rw-r-- 1 supermap supermap  24K 5月   4 15:02 osm-discovery.ipynb
-rw-rw-r-- 1 supermap supermap 5.0K 4月  24 17:45 osm-overpass.ipynb
-rw-rw-r-- 1 supermap supermap  15K 4月  23 08:23 osm-tag2json.ipynb
-rw-rw-r-- 1 supermap supermap 2.7M 5月   4 15:00 osm_test.osm

直接读取本地文件,获得范围信息。

#bsr = bs(atext,"lxml")
bsr = bs(open("osm_test.osm"),"lxml")

mbr = bsr.find_all('bounds')
print(mbr)
[<bounds maxlat="48.1450000" maxlon="11.5430000" minlat="48.1400000" minlon="11.5400000"></bounds>]

获得osm文件中所有的node对象。

nodelist = bsr.find_all('node')

print("All Nodes:",len(nodelist),", list 0-5:")
pprint(nodelist[0:5])
All Nodes: 1864 , list 0-5:
[<node changeset="34651972" id="398692" lat="48.1452196" lon="11.5414971" timestamp="2015-10-15T10:53:28Z" uid="2290263" user="soemisch" version="20" visible="true">
<tag k="tmc" v="DE:35375"></tag>
</node>,
 <node changeset="34904180" id="1956100" lat="48.1434822" lon="11.5487963" timestamp="2015-10-27T14:01:37Z" uid="2385132" user="MENTZ_TU" version="43" visible="true">
<tag k="tmc" v="DE:61453"></tag>
<tag k="TMC:cid_58:tabcd_1:Class" v="Point"></tag>
<tag k="TMC:cid_58:tabcd_1:Direction" v="positive"></tag>
<tag k="TMC:cid_58:tabcd_1:LCLversion" v="9.00"></tag>
<tag k="TMC:cid_58:tabcd_1:LocationCode" v="35356"></tag>
<tag k="TMC:cid_58:tabcd_1:NextLocationCode" v="35357"></tag>
<tag k="TMC:cid_58:tabcd_1:PrevLocationCode" v="35355"></tag>
</node>,
 <node changeset="10842011" id="21565151" lat="48.1414994" lon="11.5522715" timestamp="2012-03-01T20:37:08Z" uid="342705" user="KonB" version="4" visible="true"></node>,
 <node changeset="9695595" id="21585828" lat="48.1445431" lon="11.5384205" timestamp="2011-10-30T16:47:12Z" uid="534662" user="Q12" version="17" visible="true"></node>,
 <node changeset="9883923" id="60300474" lat="48.1406915" lon="11.5502820" timestamp="2011-11-20T13:24:04Z" uid="64536" user="Michael Forster" version="4" visible="true"></node>]

查看node的数据结构。

node = nodelist[0]
print(node)
<node changeset="34651972" id="398692" lat="48.1452196" lon="11.5414971" timestamp="2015-10-15T10:53:28Z" uid="2290263" user="soemisch" version="20" visible="true">
<tag k="tmc" v="DE:35375"></tag>
</node>
node.attrs
{'changeset': '34651972',
 'id': '398692',
 'lat': '48.1452196',
 'lon': '11.5414971',
 'timestamp': '2015-10-15T10:53:28Z',
 'uid': '2290263',
 'user': 'soemisch',
 'version': '20',
 'visible': 'true'}

解析Node的属性,以K:V存储的值。

for (k,v) in node.attrs.items():
    print(k,":",v)
lon : 11.5414971
user : soemisch
timestamp : 2015-10-15T10:53:28Z
id : 398692
uid : 2290263
version : 20
visible : true
changeset : 34651972
lat : 48.1452196

将nodelist转换为Pandas.DataFrame,为了便于显示,只处理了5个node。

import pandas as pd
nodelist2 = []
for node in nodelist[0:10]:
    nodelist2.append(node.attrs)
#print(nodelist2)

df = pd.DataFrame(nodelist2)
df
changeset id lat lon timestamp uid user version visible
0 34651972 398692 48.1452196 11.5414971 2015-10-15T10:53:28Z 2290263 soemisch 20 true
1 34904180 1956100 48.1434822 11.5487963 2015-10-27T14:01:37Z 2385132 MENTZ_TU 43 true
2 10842011 21565151 48.1414994 11.5522715 2012-03-01T20:37:08Z 342705 KonB 4 true
3 9695595 21585828 48.1445431 11.5384205 2011-10-30T16:47:12Z 534662 Q12 17 true
4 9883923 60300474 48.1406915 11.5502820 2011-11-20T13:24:04Z 64536 Michael Forster 4 true
5 2434259 256554156 48.1431978 11.5197388 2009-09-10T10:34:54Z 127922 w3box 4 true
6 11085110 256554158 48.1432360 11.5170168 2012-03-24T14:42:27Z 342705 KonB 5 true
7 9505942 256554152 48.1420008 11.5383182 2011-10-08T19:22:24Z 334153 Alexander Roalter 4 true
8 30794039 1423405650 48.1398728 11.5447444 2015-05-04T23:26:30Z 354141 Anoniman 2 true
9 9212407 1423405651 48.1399051 11.5444005 2011-09-04T20:47:20Z 17085 cfaerber 1 true

将Pandas.DataFrame转为GeoPandas.DataFrame,点生成为GeoSeries。

注意: 需要安装shapely和geopandas包。在anaconda先运行source activate GISpark,然后安装:

conda install -y -c https://conda.anaconda.org/conda-forge fiona  
conda install -y -c https://conda.anaconda.org/conda-forge gdal  
conda install -y -c https://conda.anaconda.org/conda-forge geopandas  
conda install -y -c https://conda.anaconda.org/conda-forge geojson
from shapely.geometry import (Point, LinearRing, LineString, Polygon, MultiPoint)
from geopandas import GeoSeries, GeoDataFrame
from geopandas.base import GeoPandasBase

def node2pandas(nodelist):
    nodelist2 = []
    for node in nodelist[0:10]:
        nodelist2.append(node.attrs)
    df = pd.DataFrame(nodelist2)
    return df

def pandas2geopandas(nodelist):
    pass

def node2geopandas(nodelist):
    df = node2pandas(nodelist)

    ps = []
    ps0 = [1]
    for index, row in df.iterrows():
        #print(index,':',row['lat'],'-',row['lon'])
        ps0[0] = Point(float(row['lon']),float(row['lat']))
        ps.append(ps0[0])

    gs = GeoSeries(ps,crs={'init': 'epsg:4326', 'no_defs': True})        
    geodf = GeoDataFrame({'id' : df["id"],'user' : df["id"], 
                        'lon' : df["lon"],'lat' : df["lat"],
                        'timestamp' : df["timestamp"],'uid' : df["uid"],'version' : df["version"],
                        'geometry' : gs
                        })
    return geodf
gdf = node2geopandas(nodelist)
gdf
geometry id lat lon timestamp uid user version
0 POINT (11.5414971 48.1452196) 398692 48.1452196 11.5414971 2015-10-15T10:53:28Z 2290263 398692 20
1 POINT (11.5487963 48.1434822) 1956100 48.1434822 11.5487963 2015-10-27T14:01:37Z 2385132 1956100 43
2 POINT (11.5522715 48.1414994) 21565151 48.1414994 11.5522715 2012-03-01T20:37:08Z 342705 21565151 4
3 POINT (11.5384205 48.1445431) 21585828 48.1445431 11.5384205 2011-10-30T16:47:12Z 534662 21585828 17
4 POINT (11.550282 48.1406915) 60300474 48.1406915 11.5502820 2011-11-20T13:24:04Z 64536 60300474 4
5 POINT (11.5197388 48.1431978) 256554156 48.1431978 11.5197388 2009-09-10T10:34:54Z 127922 256554156 4
6 POINT (11.5170168 48.143236) 256554158 48.1432360 11.5170168 2012-03-24T14:42:27Z 342705 256554158 5
7 POINT (11.5383182 48.1420008) 256554152 48.1420008 11.5383182 2011-10-08T19:22:24Z 334153 256554152 4
8 POINT (11.5447444 48.1398728) 1423405650 48.1398728 11.5447444 2015-05-04T23:26:30Z 354141 1423405650 2
9 POINT (11.5444005 48.1399051) 1423405651 48.1399051 11.5444005 2011-09-04T20:47:20Z 17085 1423405651 1

保存为shape格式文件。

filename = "osm_test.shp"
gdf.to_file(filename)

查看一下文件列表。

!ls -l -h
总用量 2.8M
-rw-rw-r-- 1 supermap supermap  25K 5月   4 15:17 osm-discovery.ipynb
-rw-rw-r-- 1 supermap supermap 5.0K 4月  24 17:45 osm-overpass.ipynb
-rw-rw-r-- 1 supermap supermap  15K 4月  23 08:23 osm-tag2json.ipynb
-rw-rw-r-- 1 supermap supermap   10 5月   4 15:17 osm_test.cpg
-rw-rw-r-- 1 supermap supermap 5.8K 5月   4 15:17 osm_test.dbf
-rw-rw-r-- 1 supermap supermap 2.7M 5月   4 15:00 osm_test.osm
-rw-rw-r-- 1 supermap supermap  380 5月   4 15:17 osm_test.shp
-rw-rw-r-- 1 supermap supermap  180 5月   4 15:17 osm_test.shx