Lunarpedia:Autostub3/Source
Source code as of the partially successful execution that stopped after 327 entries.
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
# #
# PreAutostub 3 #
# #
# A specialized tool to feed a specific group of #
# HTML sources into Autostub3 #
# #
# PUBLIC DOMAIN #
# #
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
# Label for the current run; it is appended to the name of the XML file
# written at the end of the script.
TESTNO = "Gamma_A" #? lost count here
#Parser
def Parser(List):
    """Convert one HTML dictionary entry into wiki text.

    List is iterated one character at a time through a small state machine:

      Text   -- copy characters to the output until a '<' starts a tag
      Tag    -- collect the tag text until '>', then act on the tag name
                returned by ReadTag()
      A      -- collect link text until '<', then emit a piped wikilink
      Strong -- collect the headword until '<' and store it as the name
      End    -- discard everything up to and including the next '>'

    Tag handling:
      A                 switch to A mode
      STRONG            first one: switch to Strong mode;
                        later ones: emit "'''" and switch to End mode
      I, /I             emit "''"
      B, /B             emit "'''"
      CITE, /CITE       emit "<ref>" / "</ref>" (CITE also adds flag "R")
      /A                emit nothing
      P, DD             emit "<BR/>"
      MATH              emit the whole tag text lower-cased, in angle brackets
      /MATH             emit "</math>"
      IMG               emit a "Missing Image" marker and add flag "I"
      anything else     emit the tag verbatim in angle brackets

    Nested tags are not supported; End mode assumes the next tag is the
    matching end tag.

    Returns the list [output text, article name, flags].
    """
    # NOTE(review): this copy of the source lost its indentation; the nesting
    # of the second-STRONG branch and of the A-mode branch was reconstructed
    # from the statement order -- confirm against a pristine copy.

    # Tags whose only effect is a fixed piece of output text.
    fixed_markup = {
        "I": "''",
        "/I": "''",
        "B": "'''",
        "/B": "'''",
        "CITE": "<ref>",
        "/CITE": "</ref>",
        "/A": "",
        "P": "<BR/>",
        "/MATH": "</math>",
        "DD": "<BR/>",
    }

    pieces = []        # fragments of the output text, joined at the end
    tag_text = ""      # raw text of the tag being read
    link_text = ""     # text collected inside an <A> element
    headword = ""      # article name taken from the first <STRONG>
    flags = ""         # "R" = has references, "I" = missing image
    have_name = False  # becomes True once the headword has been captured
    state = "Text"

    for ch in List:
        if state == "Text":
            if ch == "<":
                state = "Tag"
                tag_text = ""
            else:
                pieces.append(ch)

        elif state == "Tag":
            if ch != ">":
                tag_text += ch
                continue

            tag_name = ReadTag(tag_text)
            state = "Text"  # default; overridden by the special cases below
            if tag_name == "A":
                state = "A"
            elif tag_name == "STRONG":
                if have_name:
                    state = "End"
                    pieces.append("'''")
                else:
                    state = "Strong"
            elif tag_name in fixed_markup:
                pieces.append(fixed_markup[tag_name])
                if tag_name == "CITE":
                    flags += "R"
            elif tag_name == "MATH":
                pieces.append("<" + tag_text.lower() + ">")
            elif tag_name == "IMG":
                pieces.append("'''Missing Image:" + tag_text + "'''")
                flags += "I"
            else:
                pieces.append("<" + tag_text + ">")

        elif state == "A":
            if ch == "<":
                if link_text != "":
                    pieces.append("[[" + CapIt(link_text) + "|" + link_text + "]]")
                state = "End"
                link_text = ""
            else:
                link_text += ch

        elif state == "Strong":
            if ch == "<":
                headword = CapIt(headword)
                state = "End"
                have_name = True
            else:
                headword += ch

        elif state == "End":
            if ch == ">":
                state = "Text"

        else:
            raise TypeError("Parser internal error: WTF is " + state + " mode??")

    return ["".join(pieces), headword, flags]
def KingMe(stringy):
    '''Capitalizer that doesn't uncapitalize.

    Upper-cases the first character of stringy and leaves every other
    character exactly as it was.  An empty string is returned unchanged
    (slicing is used instead of indexing so it cannot raise IndexError).
    '''
    return stringy[:1].upper() + stringy[1:]
def ReadTag(rawtext):
    """Return the tag name from the raw text found between '<' and '>'.

    The name is everything before the first space, upper-cased, with any
    leading or trailing backslashes, dots and spaces removed.
    """
    first_word = rawtext.split(' ', 1)[0]
    shouted = first_word.upper()
    return shouted.strip('\\. ')
def CapIt(Name):
    """Return a consistently capitalized form of Name.

    Name is split on whitespace; each word has leading/trailing commas and
    dots removed and its first character upper-cased (the rest of the word
    is left alone, so acronyms survive).  The words are joined with single
    spaces.

    Words that consist only of commas/dots are skipped; previously such a
    word reached the capitalizer as an empty string and raised IndexError.
    """
    words = []
    for token in Name.split():
        word = token.strip(',.')
        if not word:
            # nothing left after removing punctuation
            continue
        words.append(word[:1].upper() + word[1:])
    return " ".join(words)
def FindRedir(gunk):
    """Pick the wikilink target that a redirect entry should point to.

    Scans gunk for "[[" and records the text from there up to the first
    "|" or "]" that follows.  A link that is never terminated is not
    recorded.

    Returns:
      None             if no link was recorded
      the link target  if exactly one link was recorded
      the longest target (the earliest one on a tie) if several were
      recorded; None if every one of them is empty
    """
    targets = []
    cursor = 0
    while True:
        opener = gunk.find("[[", cursor)
        if opener < 0:
            break
        start = opener + 2
        # the target ends at whichever terminator comes first
        stops = [pos for pos in (gunk.find("|", start), gunk.find("]", start))
                 if pos >= 0]
        if not stops:
            break
        stop = min(stops)
        targets.append(gunk[start:stop])
        cursor = stop + 1

    if not targets:
        return None
    if len(targets) == 1:
        return targets[0]
    # max() keeps the first of equally long candidates; an empty winner
    # means no candidate had any text at all
    return max(targets, key=len) or None
#Get raw html
# Files to read, in order.  Only part A is active; parts B and C were
# disabled in the original script:
#   '/home/Luna/Raw_SP-7_B.txt_'
#   '/home/Luna/Raw_SP-7_C.txt_'
SourceFiles = ['/home/Luna/Raw_SP-7_A.txt']

RawML = ""
for path in SourceFiles:
    tsv = open(path)
    try:
        RawML += tsv.read()
    finally:
        # close the handle even when read() fails
        tsv.close()

#Parse Page into slices using <DT> tags
RawList = RawML.split('<DT>')

#Create ProtoArticles and Redirects and stuff them full of proto-articles
ProtoArticles = []  # [article text, article name, flags]
Redirects = []      # (redirect name, redirect to this article)
bailout = 0
punchout = 0
# Classify every <DT> slice as a dropped entry, a redirect or an article,
# fixing up names that the parser cannot get right on its own.
#
# NOTE(review): this copy of the source lost its indentation, so the nesting
# below is reconstructed from the statement order.  Deliberate changes from
# the old if/elif chains:
#   * generated redirects store the bare target name; doredir() adds the
#     "#REDIRECT: [[...]]" wrapper, which used to be applied twice
#   * redirect entries whose name is empty are skipped (the old "punchout"
#     test was unreachable behind two identical elif conditions)
#   * no lower-case redirect is made for an article that was dropped, or
#     when the lower-cased title is identical to the article title
#   * "Corv" and "Corvus" were renamed to "", giving a page with no title;
#     an article whose patched name is empty is now dropped

# Name patches for entries that become redirects.
RedirectRenames = {
    "ADF (abbr)": "ADF",
    "ADP (abbr)": "ADP",
    "AFC (abbr)": "AFC",
    "Andromeda (abbr And Andr)": "Andromeda",
    "Antlia (abbr Ant Antl.)": "Antlia",
    "APU (abbr)": "APU",
    "AU (abbr)": "AU",
    "Atomic Mass Unit (abbr Amu)": "Atomic Mass Unit",
    "Atomic Weight Unit (abbr Awu)": "Atomic Weight Unit",
    "Anti-g Suit": "Anti G Suit",
}

# Name patches for entries that become articles.
ArticleRenames = {
    "Acoustic Velocity (": "Acoustic Velocity",
    "(abbr ADC)": "ADC",
    "Angular Acceleration (": "Angular Acceleration",
    "Angular Velocity (": "Angular Velocity",
    "Astronomical Unit (abbr AU)": "Astronomical Unit",
    "Atomic Weight (abbr At Wt.)": "Atomic Weight",
    "Automatic Direction Finder (abbr ADF)": "Automatic Direction Finder",
    "Automatic Frequency Control (abbr AFC)": "Automatic Frequency Control",
    "Automatic Gain Control (abbr AGC)": "Automatic Gain Control",
    "Auxiliary Power Unit (abbr APU)": "Auxiliary Power Unit",
    "Axis (plural Axes)": "Axis",
    "Acceleration Of Gravity (": "Acceleration Of Gravity",
    "Air Position Indicator (abbr API)": "Air Position Indicator",
    "Alphanumeric (alphabet Plus Numeric)": "Alphanumeric",
    "Ampere (abbr A)": "Ampere",
    "AND Gate And Gate": "AND Gate",
    "Anti-matter": "Antimatter",
    "Anti-particle": "Antiparticle",
    "Alga (plural Algae)": "Algae",
    # NOTE(review): the next key and the "Baum..." key below contain
    # replacement characters exactly as found in this copy of the source;
    # presumably they stood for accented letters -- verify against the
    # names the parser really produces.
    "B��y Chair": "Bárány Chair",
    "Barn (Abbr B)": "Barn",
    "(abbr BT Sequencing)": "BT Sequencing",
    "Baum�scale (abbr Be)": "Baumé scale",
    "Bernoulli Law Or Bernoulli Theorem": "Bernoulli's Law",
    "Bluntness (": "Bluntness",
    "Boiling Point (abbr Bp)": "Boiling Point",
    "Bohr Magneton Electronic Bohr Magneton": "Bohr Magneton",
    "Boltzmann Constant (symbol": "Boltzmann Constant",
    "Btu (abbr)": "BTU",
    "Calorie (abbr Cal)": "Calorie",
    "Carrier Wave (abbr Cw)": "Carrier Wave",
    "Cathode-ray Tube (abbr CRT)": "Cathode Ray Tube",
    "Cathode-ray Oscilloscope": "Cathode Ray Oscilloscope",
    "Celsius Temperature Scale (abbr C)": "Celsius Temperature Scale",
    "Centigrade Temperature Scale (abbr C)": "Centigrade Temperature Scale",
    "Centimeter (abbr Cm)": "Centimeter",
    "Centimeter-gram-second System (abbr Cgs)": "Centimeter-gram-second System",
    "Centipoise (abbr Cp)": "Centipoise",
    "Cermet [ceramic + Metal]": "Cermet",
    "Circle Of Equal Probability (abbr CEP)": "Circle Of Equal Probability",
    "Circular Dispersion (abbr CD)": "Circular Dispersion",
    "Co": "Co (prefix)",
    "Coefficient (abbr Coeff)": "Coefficient",
    "Coherent Oscillator (abbr Coho)": "Coherent Oscillator",
    "Comes (plural Comites)": "Comes",
    "Continuous Waves (abbr CW)": "Continuous Waves",
    "Coolant (": "Coolant",
    "Correlation Tracking And Ranging (abbr Cotar)": "Correlation Tracking And Ranging",
    "Correlation Tracking And Triangulation (abbr Cotat)": "Correlation Tracking And Triangulation",
    "Coulomb (abbr C)": "Coulomb",
    "Corv": "",
    "Corvus": "",
    "Curie (abbr C)": "Curie",
    "Caelum (abbr Cae Cael)": "Caelum",
    "Bit Rate": "Bitrate",
    "Black Body Blackbody": "Black Body",
    "Black-body Radiation": "Black Body Radiation",
    "Body Of Revolution": "Body of Revolution",
}

# Parsed names of entries that must not become articles at all.
ArticleDrops = [
    "",
    "[[]]",
    "AND-NOT Gate = Exclusive OR Circuit",
    "Aps Apus",
    "Aql Aqil",
    "Ares",
    "Ari Arie",
    "Aur Auri",
    "Cae Cael",
    "Cam Caml",
    "Caml",
    "Cap Capr",
    "Car Cari",
    "Cet Ceti",
    "Cha Cham",
    "Cir Circ",
    "CMa C Maj",
    "CMi C Min",
    "Cnc Canc",
    "Colm",
    "Com Coma",
    "Copy",
    "Cor A",
    "Cor B",
    "CrA Cor A",
    "CrB Cor B",
    "CRT (abbr)",
    "Crt Crat",
    "Cruc",
    "Crv Corv",
    "Cvn C Ven",
    "Cyg Cygn",
]

for q in RawList:
    if q == "":
        continue
    artie = Parser(q)

    if artie[0].count("constellation"):
        # constellation entries are left out entirely
        print("constellation check removing entry: " + artie[1])

    elif (artie[0].count('=', 0, 5) or artie[0].upper().count('SEE', 0, 20)):
        # "= something" / "see something" entries become redirects
        artie2 = FindRedir(artie[0])
        if artie2:
            artie[1] = RedirectRenames.get(artie[1], artie[1])
            if artie[1] != "":
                Redirects += [(artie[1], artie2)]

    else:
        # ordinary definition: drop it or patch its name
        if artie[1] in ArticleDrops:
            continue
        artie[1] = ArticleRenames.get(artie[1], artie[1])
        if artie[1] == "":
            continue
        #add to final list
        ProtoArticles += [artie]
        #Redirect-Creation-O-Mat
        # multi-word titles also get a redirect from the form with only
        # the first letter capitalized
        if artie[1].count(" "):
            alias = artie[1].capitalize()
            if alias != artie[1]:
                Redirects += [[alias, artie[1]]]

print(str(len(ProtoArticles)) + " articles")
print(str(len(Redirects)) + " redirects")
##qqq=0
##for q in ProtoArticles:
## if q[1] == "Aurora": print "aurora is",qqq
## if q[1] == "Absolute Magnitude": print "absolute magnitude is",qqq
## qqq += 1
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
# #
# Definition Autostub Generator #
# #
# Public Domain #
# #
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
# #
# from PreAutostub3: #
# #
# ProtoArticles (article text, article name, flags) #
# Redirects (redirect name, redirect to this article) #
# #
def XMLproof(textish):
    """Return textish with XML-special characters replaced by entities.

    '<', '>', '&' and '"' become '&lt;', '&gt;', '&amp;' and '&quot;';
    every other character is copied unchanged.  The entity strings had been
    decoded back into the bare characters in this copy of the source, which
    made the function a no-op and left an unterminated string literal.
    """
    entities = {
        "<": "&lt;",
        ">": "&gt;",
        "&": "&amp;",
        '"': "&quot;",
    }
    return "".join([entities.get(ch, ch) for ch in textish])
def StartXML():
    """Return the opening lines of the MediaWiki import file.

    The result is a fresh list on every call: a one-element list holding
    the <mediawiki> root tag, followed by the <siteinfo> lines as strings.
    """
    root_tag = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">\n'
    return [
        [root_tag],
        ' <siteinfo>\n',
        ' <sitename>Lunarpedia</sitename>\n',
        ' </siteinfo>\n',
    ]
def EndXML():
    """Return the closing line of the MediaWiki import file.

    The closing tag is wrapped in a list inside the returned list.
    """
    closing_tag = '</mediawiki>\n'
    return [[closing_tag]]
def ArtXML(title, contrib, date, text):
    '''XML markup for one page of the import file.

    title   -- title of the page
    contrib -- name of the script (ie Autostub2), used as the username
    date    -- text placed inside the <timestamp> element
    text    -- the page body; each item of it is appended to the result
               as its own element, so a string contributes one element
               per character

    Returns a list whose first element is itself a list (the <page> tag)
    and whose remaining elements are strings.  title, contrib and date are
    inserted as given.
    '''
    head = [
        [' <page>\n'],
        ' <title>' + title + '</title>\n',
        ' <revision>\n',
        ' <timestamp>' + date + '</timestamp>',
        ' <contributor>\n',
        ' <username>' + contrib + '</username>\n',
        ' </contributor>\n',
        ' <text xml:space="preserve">',
    ]
    tail = [
        '</text>\n',
        ' </revision>\n',
        ' </page>\n',
    ]
    return head + list(text) + tail
def linebrk(listish):
    """Return a new list with a newline appended to every item of listish."""
    return [item + "\n" for item in listish]
def Stringify(listish):
    """Flatten a nested list of strings into one string.

    Items of listish are taken in order; strings are used as they are and
    lists are flattened recursively.

    Raises TypeError for an item that is neither a string nor a list.  The
    type and value of the offending item are reported in the exception
    message instead of being printed first.
    """
    pieces = []
    for item in listish:
        if isinstance(item, str):
            pieces.append(item)
        elif isinstance(item, list):
            pieces.append(Stringify(item))
        else:
            raise TypeError("non-string non-list!!! got %s: %r"
                            % (type(item), item))
    return "".join(pieces)
def dodef(thisentry):
    """Create the lines of a stub article from a parsed entry.

    thisentry is a sequence:
      0 -- body of definition article
      1 -- name of article
      2 -- flags
           I: missing image
           R: needs references section

    Returns a list of strings, each ending in a newline.
    """
    body, name, flags = thisentry[0], thisentry[1], thisentry[2]

    lines = [
        "{{Autostub}}",
        "{{Initial Proof Needed}}",
        "'''" + name + "'''",
        body,
        "==References==",
        "''This article is based on NASA's [[NASA SP-7|Dictionary of Technical Terms for Aerospace Use]]''",
    ]
    if flags.count("R"):
        # the body contains <ref> tags, so give them somewhere to show up
        lines.append("<references/>")
    lines.append("[[Category%3ADefinitions]]")
    lines.append("[[Category%3ANASA SP-7]]")
    if flags.count("I"):
        lines.append("[[Category%3ADefinitions with Missing Images]]")

    return [line + "\n" for line in lines]
def doredir(thisentry):
    """Create the text of a redirect page from a two item sequence.

    thisentry:
      0 -- name of redirect (not used here)
      1 -- redirect to this article

    Returns a one-element list holding the redirect line.
    """
    target = thisentry[1]
    return ["".join(["#REDIRECT: [[", target, "]]"])]
def doare(tup):
    """Return the XML page for one redirect entry.

    tup[0] is used as the page title; the page text comes from doredir(),
    is passed through XMLproof() and handed to ArtXML() one character per
    list element.
    """
    escaped = []
    for line in doredir(tup):
        # extending a list with a string adds its characters one by one
        escaped.extend(XMLproof(line))
    return ArtXML(tup[0], "Autostub3", "2007-03-07T18:00:00Z", escaped)
def doanart(tup):
    """Return the XML page for one proto-article.

    tup[1] is used as the page title; the page text comes from dodef(),
    is passed through XMLproof() and handed to ArtXML() one character per
    list element.
    """
    escaped = []
    for line in dodef(tup):
        # extending a list with a string adds its characters one by one
        escaped.extend(XMLproof(line))
    return ArtXML(tup[1], "Autostub3", "2007-03-07T18:00:00Z", escaped)
# Assemble the import file: header, one page per article, one page per
# redirect, footer.
execute = StartXML()
for q in ProtoArticles:
    execute += doanart(q)
for q in Redirects:
    execute += doare(q)
execute += EndXML()

# Flatten the nested lists into the final text and write it out.
execute = Stringify(execute)
do_xml = open("/home/Luna/autostub3test"+TESTNO+".xml", 'w')
try:
    do_xml.write(execute)
finally:
    # close the handle even when write() fails
    do_xml.close()
#create and sort table to avoid name collisions
##testlist = []
##redund = []
##
##for q in ProtoArticles:
## testlist += ["[["+q[1]+"]] article<BR/>"]
## redund += [q[1]]
##
##for q in Redirects:
## testlist += ["[["+q[0]+"]] redirect<BR/>"]
## redund += [q[0]]
##
##print "sorting"
##testlist.sort()
##redund.sort()
##
###check for interproject name collisions
##past = "configio.$$$"
##for q in redund:
## if q == past: print "Collision:",q
## past = q
##
##print "sorted:"
##print
##for q in testlist: print q





