Thursday, September 24, 2015

Data Wrangling with MongoDB Lesson 6.11 Improving Street Names

"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "example.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Rd.": "Road",
            "Ave": "Avenue",
            "N." : "North"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    #print elem.attrib['k']
    #print elem.attrib['v']
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    print street_types
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    # YOUR CODE HERE
    betterNameParts= []

    name = name.split()
    for index, part in enumerate(name):
        if part in mapping.keys():
            betterNameParts.append(mapping[part])
        else:
            betterNameParts.append(part)
       
    betterName = ' '.join(betterNameParts)

    return betterName


def test():
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


if __name__ == '__main__':
    test()

Wednesday, September 23, 2015

numpy.ndarray doesn't have count attribute

First convert it into a list, by using list() function. Then use the count method to count.

list(nda).count(1)

Tuesday, September 15, 2015

Python code to process CPC 3007 data

# this code process the one minutes cpc data and convert it to each seconds

import xlrd, xlwt, datetime
from array import array

fileName1 = '06082015.xlsx'
f1WorkBook = xlrd.open_workbook(fileName1)

#check sheet names
sheetName = f1WorkBook.sheet_names()
# select the first sheet
f1Sheet1 = f1WorkBook.sheet_by_index(0)
sn = 19
en = 147
#copy and paste the header lines, row 0 to 17
header = [f1Sheet1.row(r) for r in range(18)]
#header = [f1Sheet1.cell_value(row, col) for col in range(2) for row in range(18)]
print header[0]


### ----------This section deals with header---------------------------
newWorkBook = xlwt.Workbook()
newsheet = newWorkBook.add_sheet('processed_'+fileName1[0:8])

for rowIndex, rowValue in enumerate(header):
    for colIndex, cellValue in enumerate(rowValue):
        newsheet.write(rowIndex, colIndex, cellValue.value)
     
###-------This part deals with the time column---------------------------------
time = [f1Sheet1.cell_value(row, col) for col in range(2) for row in range(sn-1,en)]
newTime = [0]*((en-sn)*60) # create an empty vector
newTime[0] = time[0]
print newTime[0]

for i in range(0, en-sn):
    for j in range(0,60):
        #if j+60*i <70: font="">
         #   print j+60*i
        newTime[j+60*i] = newTime[0]+(j+60*i)/(1440*60.0)

for rowIndex in range(18, 18+(en-sn)*60):
    #print rowIndex
    newsheet.write(rowIndex, 0, newTime[rowIndex-18])  
 
###---------This section deals with the concentrations -----------------------

conc = [f1Sheet1.cell_value(row,col) for col in range(1,2) for row in range(sn-1, en)]
newConc = [0]*((en-sn)*60)
for i in range(0, en-sn):
    for j in range(0,60):
        newConc[j+60*i] = conc[i]

for rowIndex in range(18, 18+(en-sn)*60):
    newsheet.write(rowIndex, 1, newConc[rowIndex-18])

### ------------------Save file  
newWorkBook.save('processed_'+fileName1[0:8]+'.xls')

my-alpine and docker-compose.yml

 ``` version: '1' services:     man:       build: .       image: my-alpine:latest   ```  Dockerfile: ``` FROM alpine:latest ENV PYTH...