passerium: Qin scores & history

Objective - read in a webpage, find elements (score/song name, explanatory link) and create a dictionary.
The impressive source: http://silkqin.com/zh02qnpu.htm

Scrapping source

1.extracting and renaming files
2.Correctly obtaining encoded characters - since the webpage contains Chinese characters, need to ensure they are captured properly

import requests
url = 'http://silkqin.com/zh06hear.htm'
response = requests.get(url)
page_content = requests.get(url).content  # returns bytes <- this extra step allows detection for special (e.g. Chinese characters)
from bs4 import BeautifulSoup
soup = BeautifulSoup(page_content, 'lxml')
#check how it looks
soup.contents

Results in: title聽絲弦古琴title
meta content=“text/html; charset=utf-8” http-equiv=“content-type”
meta content=“聽絲弦古琴” name=“description” meta content=“琴、古琴、聽琴、聽古琴、聽絲弦琴、聽絲弦古琴、絲弦、絲絃、絲線、丝弦、絲絃琴、絲弦琴、絲線琴、絲弦古琴、絲絃古琴、絲線古琴、絲桐、唐世璋、John Thompson” name=“keywords”

import re
from urllib.request import urlretrieve
from urllib.request import urlopen

html = urlopen('http://silkqin.com')
baseurl='http://silkqin.com/'
#The 'a' tag in the html does not have any text directly, but it contains a 'h3' tag that has text. 
all_links = [link.get("href") for link in soup("a")]
all_links
#get rid of none otherwise sub lists generated gives None type error
#to read on None type - https://stackoverflow.com/questions/3887381/typeerror-nonetype-object-is-not-iterable-in-python
clean = [x for x in all_links if x is not None]
# now filter for that directory
links_htm = [k for k in clean if 'htm' in k and '02qnpu' in k] 
#and 'htm#' not in k]#and '\#' not in k and '\~' not in k]

There were 164 scores listed,not 234 that results from checking length of this list.

#note-- don't reuse counter alink fromprevious= -> cannot force it into an integer as it becomes a list element of thelist in the for loop
#trying to use it as a counter yields TypeError: list indices must be integers or slices, not str
counter = 0
for alink in links_htm:
    urlretrieve((baseurl + links_htm[counter]), (links_htm[counter].rsplit('/', 1)[-1]))
    #the split takes all characters after last slash
    #regex way -- re.sub(r'^.+/([^/]+)$', r'\1', 'dsf/we/sdfl.htm')
    #more https://stackoverflow.com/questions/7253803/how-to-get-everything-after-last-slash-in-a-url
    counter += 1

What happened? Checking the directory, there are 210 of these annotation (and lyrics) html files downloaded.
Let’s collect the downloaded ones (anything with htm) using glob below, and compare against the annotation links list.
Since the glob collection has no subdirectory/nesting (besides the # page bookmarks), let’s strip those from the links list as well

import glob
downloadedhtmfiles = []
for file in glob.glob("*.htm"):
    downloadedhtmfiles.append(file)

links_htm_temp=list(range(0,len(links_htm)))  # there will be error otherwise if list is not intialized, since we don't use append below
counter = 0
for alink in links_htm:
    links_htm_temp[counter] = re.sub(r'^.+/([^/]+)$', r'\1', links_htm[counter])
    #the split takes all characters after last slash
    #regex way -- re.sub(r'^.+/([^/]+)$', r'\1', 'dsf/we/sdfl.htm')
    #more https://stackoverflow.com/questions/7253803/how-to-get-everything-after-last-slash-in-a-url
    counter += 1
#links_htm_temp[0]

def Diff(li1, li2): 
    return (list(set(li1) - set(li2)))  
print(Diff(links_htm_temp, downloadedhtmfiles))

Output:
[‘tg06gjq.htm#lyrchi’, ‘xl127ysc.htm#jzymusic’, ‘lh00toc.htm#p5’, ‘tingqinyin.htm#melody’, ‘daoyi.htm#qsdfn’, ‘yqwd.htm’, ‘xl028yyg.htm#chilyr’, ‘tg32cjq.htm#1525cjwt’, ‘xl132src.htm#linzhong’, ‘jiukuang.htm#chilyr’, ‘tg36kcyh.htm#music’, ‘03slgj.htm#kzhyy’, ‘tg01nfg.htm#lyrics’, ‘hw02qpy.htm’, ‘xl096yts.htm#lyrics’, ‘xl127ysc.htm#ysymusic’, ‘xl054cwy.htm#mjyfn’, ‘27wjctrans.htm#record’, ‘1709qfq.htm#1840muslyr’, ‘tg28frsg.htm#chilyr’, ‘daoyi.htm#lyrchi’, ‘tg10ysc.htm#chilyr’, ‘xl054cwy.htm#jy’, ‘qx14wywq.htm’, ‘tg32cjq.htm#clyrics’, ‘xl000toc.htm#p16’, ‘xl021fl.htm#feidianyinfn’, ‘fm23ygsd.htm#chilyr’, ‘xl098byd.htm#chilyrfn’, ‘cx38xsq.htm#lyrics’, ‘tg24hzd.htm#chilyr’, ‘zy13ygsd.htm#v1’, ‘fx33gg.htm#lyricsfn’, ‘fx42zwy.htm#chilyr’, ‘sq01dsc.htm#dinghuiyinfn’, ‘xl046yz.htm#1530’, ‘1709qfq.htm#1709muslyr’, ‘tg02sqc.htm#lyrics’, ‘daoyi.htm#qsdlyr’, ‘tg03xfy.htm#chilyr’, ‘sj03qjj.htm#chilyr’, ‘hw15fhts.htm’, ‘fx40dmyt.htm#chilyr’, ‘tg09wwq.htm#muslyr’, ‘tg32cjq.htm#1539cj’, ‘ty28skj.htm’, ‘lq12mss.htm’, ‘fm03qjwd.htm#chilyr’, ‘fx27wjc.htm#chilyr’, ‘xl041jyb.htm#qingyeyin’, ‘qx09lhxx.htm’, ‘ty28skj.htm#skjmp3’, ‘ylcx.htm#cgyfn’, ‘xl041jyb.htm#chilyr’, ‘yltrans.htm’, ‘fx32dyq.htm#chilyr’, ‘sq01dsc.htm#taicouyifn’, ‘tg16ysc.htm#chilyr’, ‘ylcx.htm#ylcxmusic’, ‘xl159qxb.htm#byyfn’, ‘sq18ghy.htm#daguanyinfn’, ‘fx45gjx.htm#xllyrfn’, ‘jiukuang.htm#lyrics’, ‘03slgj.htm#gd’, ‘fx31lsm.htm#chilyr’, ‘tg08ksc.htm#lyrics’, ‘ty6qcby.htm#gy’, ‘tg07wwc.htm#music’, ‘xl046yz.htm#chilyr’, ‘xl007gky.htm#chongheyinfn’, ‘xl159qxb.htm#qyyfn’, ‘xl155fqh.htm#chilyr’, ‘tg25gqlc.htm#chilyr’, ‘tg35gqf.htm#chilyr’, ‘sz03olwj.htm’]

A dictionary to cross-reference

Playing with Regex and making the first pairing for dict

take all htm, strip out #… match blurb.htm"> and First let’s reset the links list in case of any accidental changes before

#repeat of above code (in case run from this segment)
# Find links
all_links = [link.get("href") for link in soup("a")]
all_links
clean = [x for x in all_links if x is not None]
#links_htm = [k for k in clean if 'htm' in k and '02qnpu' in k and 'htm#' not in k]#and '\#' not in k and '\~' not in k]
#links_htm = [k for k in links_htm if '02qnpu' in k]
links_htm = [k for k in clean if 'htm' in k and '02qnpu' in k]

links_htm_clean=links_htm # if I reassign below directly in re.sub, it seems to overwrite the original as well
links_htm_clean[1] = re.sub(r'.*\/', r'', links_htm_clean[1]) #pat1.*pat2   any number of characters between pat1 and pat2
links_htm_clean[1] = re.sub(r'\#.*', r'', links_htm_clean[1]) 
print(links_htm_clean[1]) 
print(links_htm[3])
len(links_htm_clean)

#throwing the tested element into a loop
# links_htm = [k for k in clean if 'htm' in k and '02qnpu' in k]
# gets rid of slashes and anything preceding slash
links_htm_clean=links_htm
counter=0
for elem in links_htm_clean:
    links_htm_clean[counter] = re.sub(r'.*\/', r'', links_htm_clean[counter]) #pat1.*pat2   any number of characters between pat1 and pat2
#    links_htm_clean[counter] = re.sub(r'\#.*', r'', links_htm_clean[counter]) # works but don't want to remove the # bc sometime that signifies diff song 
#    print(links_htm_clean[counter]) 
    counter +=1
links_htm_clean[1] = re.sub(r'\#.*', r'', links_htm_clean[1]) 
print(links_htm_clean[1]) 
print(links_htm[3])
len(links_htm_clean)

# Variation 1 - cluster annotation / song name+record link
astring='<a href="http://silkqin.com/02qnpu/03slgj.htm#kzhyy">開指黃鶯吟</a>（<a href="http://silkqin.com/06hear/myrec/01tangsong/00kzhyy.mp3">聽</a>'
#m=re.split('(.htm\S+?>)',astring) 
m=re.split('.htm\S+?>',astring)  #cuts by end of first '>' for a list of two being the ahref annotation link, then song name & recording link
n=re.sub('(.mp3)\S+','',astring) #cuts everything from the '.mp3' onward to get rid of 聽</a>']
#\S = a non whitespace chara,
#+ multipe \S but add ? for as few as possible
#() keeps the separator within result
print(m)
print(n)

# Variation 2 - cluster annotation+song name / record link
astring='<a href="http://silkqin.com/02qnpu/03slgj.htm#kzhyy">開指黃鶯吟</a>（<a href="http://silkqin.com/06hear/myrec/01tangsong/00kzhyy.mp3">聽</a>'
#m=re.split('(.htm\S+?>)',astring) 
m=re.split('</a>（<a href="',astring) 
n=re.sub('(.mp3)\S+','',astring)
print(m) #a list of two
print(n)

Continuing with variation 2, let’s regex out the typical patterns surrounding the song name.

# Song Name: 
o=re.sub(r'.*\/', r'', m[0]) #first extract everything after the last / in the first element -> '03slgj.htm#kzhyy">開指黃鶯吟'
p=re.split(r'\">', o) # then split by the "> pattern between annotation link and song name ->['03slgj.htm#kzhyy', '開指黃鶯吟']
p[1] #the second element yields song name

Build the dictionary

Recall how to set up a dictionary. Then fit in song name and file name extracted above (and hope the pattern holds)

#print the soup as string text for use 
Ssoup=str(soup)
print(Ssoup,  file=open('Ssoup.txt', 'w',encoding='utf-8-sig'))
#for song titles,extract from the mass of text in SSoup file
#sub-example "<br/><a href="06hear/myrec/1491/zy08ygd.mp3"><b>聽漁歌調</b></a>"
counter=0
sep=[i for i in links_rec if i in Ssoup]
for elem in sep:
    sep = [i for i in links_rec if i in Ssoup]
    lsep=len(sep[counter]) #length of the recording file name
    idx = Ssoup.find(sep[counter]) #note where the recordingfile name is in the Ssoup string
    _idx=idx-lsep #set starting index back the length of recording file name
    TitleName[counter]=(Ssoup[_idx:idx-12])
    TitleName[counter]=TitleName[counter].split(sep=">",maxsplit=1)[1] # cut everything before <b> which precedes title
    TitleName[counter]=TitleName[counter].split(sep="</",maxsplit=1)[0] # cut everything behind </b> which follows title, keeping first element (Title)
    counter+=1

#piecemeal testing looking for explanation htm files which just precede song titles
teststr="<a href=\"balshblahs<a href=\"http://silkqin.com/02qnpu/32zczz/tingqinyin.htm\">聽琴吟</a>"
teststr
idx = Ssoup.find("古風操")
idx

idx_ = Ssoup[idx-180:idx].rfind('<a href=\"') #reverse find looks forward, so set it some number back from index
print(Ssoup[idx-180+idx_:idx])

o=re.sub(r'.*\/', r'', teststr)
o

# trying this in the text string Ssoup 
idx = Ssoup.find(elem) #note where the file name is in the Ssoup string #'秋風辭'
idx_ = Ssoup[idx-50:idx].rfind('<a href=\"') 
idx_=idx-50+idx_
idx_
Ssoup[idx_:idx]
teststr=Ssoup[idx_+9:idx-2]
teststr=re.sub(r'.*\/', r'', teststr)
teststr

Readying for loop by setting up htm list “explan” and cutting out first element which is a blank for some reason

TitleName=TitleName[1:]
explan=TitleName
#alternatively extract title names from previously compiled dictionary:RecCatalogue.keys()

looping this search

#example "<br/>"<a href=\"http://silkqin.com/02qnpu/32zczz/tingqinyin.htm\">聽琴吟</a>"
teststr="blob"
#remSsoup=Ssoup
sep = [i for i in TitleName if i in Ssoup]
counter=0
for elem in sep:
    sep = [i for i in TitleName if i in Ssoup]
    idx = Ssoup.find(elem) #find where songtitle is
    idx_ = Ssoup[idx-50:idx].rfind('<a href=\"') #then skip 50 characters back and look for match of a href link with highest index (closest to song title)
    idx_=idx-50+idx_
    teststr=Ssoup[idx_+9:idx-2] #cut out the a href frames brackets
    explan[counter]=re.sub(r'.*\/', r'', teststr) # get rid of everything before last slash in htm link
    counter+=1
explan

Outputt explanatory htm’s bby song

pyt{python, eval=FALSE,include=TRUE} #HtmCatalogue=dict(zip(TitleName, explan)) HtmCatalogue ``` hSuccessful output:

Qin scores & history

Authors

Affiliations

Published

DOI